diff --git a/CMakeLists.txt b/CMakeLists.txt index 4117f077219d3b8fc097631073eafa748ff918bc..23bb27e77b9eab0c322a71a8ff570d12d1050377 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,8 +61,11 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_CONTRIB "Compile the third-party contribution" OFF) +option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debugging." OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) +option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocol" OFF) +option(WITH_SYSTEM_BLAS "Use system blas library" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -131,6 +134,10 @@ if (NOT DEFINED WITH_MKLDNN) set(WITH_MKLDNN OFF) endif() endif() + +if (REPLACE_ENFORCE_GLOG) + add_definitions("-DREPLACE_ENFORCE_GLOG") +endif() ######################################################################################## include(external/mklml) # download mklml package @@ -153,12 +160,24 @@ include(external/cares) if(WITH_DISTRIBUTE) if(WITH_GRPC) include(external/grpc) + message(STATUS "Use grpc framework.") else() + message(STATUS "Use brpc framework.") include(external/leveldb) include(external/brpc) endif() endif() +if(WITH_BRPC_RDMA) + message(STATUS "Use brpc with rdma.") + if(WITH_GRPC) + message(FATAL_ERROR "Can't use grpc with brpc rdma.") + endif() + if(NOT WITH_DISTRIBUTE) + message(FATAL_ERROR "Can't use brpc rdma in a non-distributed environment.") + endif() +endif() + include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) @@ -178,7 +197,7 @@ include(inference_lib) # add paddle fluid inference libraries include_directories("${PADDLE_SOURCE_DIR}") -include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") +include_directories("${PADDLE_SOURCE_DIR}/paddle/legacy/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") @@ -222,7 +241,7 @@ add_subdirectory(proto) if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY) # "add_subdirectory(go)" should be placed after the following line, # because it depends on paddle/optimizer.
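A note on the new `REPLACE_ENFORCE_GLOG` option above: `add_definitions("-DREPLACE_ENFORCE_GLOG")` only injects a preprocessor symbol, and the C++ side has to consume it. Below is a minimal sketch of that pattern, assuming a hypothetical `MY_ENFORCE` macro as a stand-in; Paddle's real `PADDLE_ENFORCE` machinery lives in its enforce header and is more elaborate.

```cpp
// Sketch only: MY_ENFORCE is a hypothetical stand-in for PADDLE_ENFORCE.
#include <glog/logging.h>
#include <stdexcept>

#ifdef REPLACE_ENFORCE_GLOG
// With -DREPLACE_ENFORCE_GLOG, a failed condition goes through glog's
// CHECK, which logs a FATAL message and aborts instead of throwing.
#define MY_ENFORCE(cond, msg) CHECK(cond) << (msg)
#else
// Default build: a failed condition raises an exception callers can catch.
#define MY_ENFORCE(cond, msg)                   \
  do {                                          \
    if (!(cond)) throw std::runtime_error(msg); \
  } while (0)
#endif

int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  MY_ENFORCE(argc >= 1, "argc should never be zero");
  return 0;
}
```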
- add_subdirectory(paddle/optimizer) + add_subdirectory(paddle/legacy/optimizer) endif() # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b1b02bcc2f4fd14297715bcf5bfd1617e3d5f0c9..b878f37a5b8e807e5aa346e0074a741f2f8b6cc5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -159,4 +159,4 @@ This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the - verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) - verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) - verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) -- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math) +- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math) diff --git a/README.md b/README.md index 8d89c6b1ec9e4aefbd64328dedb4e8c7cc50c21b..63abca069a6629ac59739224ded9cd9f06207d0a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) -[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 99c9d79b068f5886012fd702d84d0666b9d197b5..a79f25ccc6ace1594f3f331633130eaace5e175b 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -125,6 +125,10 @@ def parse_args(): parser.add_argument( '--use_inference_transpiler', action='store_true', - help='If set, uses inference transpiler to optimize the program.') + help='If set, use inference transpiler to optimize the program.') + parser.add_argument( + '--no_random', + action='store_true', + help='If set, keep the random seed and do not shuffle the data.') args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py old mode 100755 new mode 100644 index dcd4d9ea95d816029317a29055b5ca8273ac9f43..94ea7bd6aca7c9595037a2dacc5e36d4c77827e7 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -132,10 +132,6 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, exe.run(startup_prog) # Use inference_transpiler to speedup - if args.use_inference_transpiler: - t = fluid.InferenceTranspiler() - t.transpile(infer_prog, place) - if not args.use_reader_op: feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() @@ -186,6 +182,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))), # evaluation if not args.no_test and batch_acc and not 
args.use_reader_op: + if args.use_inference_transpiler: + t = fluid.InferenceTranspiler() + t.transpile(infer_prog, place) + pass_test_acc = test(exe, infer_prog, test_reader, feeder, batch_acc) print(", Test Accuracy: %f" % pass_test_acc) @@ -316,6 +316,8 @@ def main(): args = parse_args() print_arguments(args) print_paddle_envs() + if args.no_random: + fluid.default_startup_program().random_seed = 1 # the unique trainer id, starting from 0, needed by trainer # only diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9ed1093c54a501cc93dbbf9c3651fe70914ce26b..d44a9c07d31cfae9d54ad5949b85c77e60eae258 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -197,12 +197,12 @@ def get_model(args): optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) batched_train_reader = paddle.batch( - paddle.reader.shuffle( + train_reader if args.no_random else paddle.reader.shuffle( train_reader, buf_size=5120), batch_size=args.batch_size * args.gpus, drop_last=True) batched_test_reader = paddle.batch( - train_reader, batch_size=args.batch_size, drop_last=True) + test_reader, batch_size=args.batch_size, drop_last=True) return avg_cost, inference_program, optimizer, batched_train_reader,\ batched_test_reader, batch_acc diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index e3b9d94215a858c5c9a34e1b7e97540f1876801d..6ed51c648478efb9784d0c43b169c285e740e0f3 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -83,18 +83,20 @@ else() set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) endif() -find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS +if(WITH_SYSTEM_BLAS) + find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) -find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS + find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) -if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER REFERENCE) - set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) - set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY}) - add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) - message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER REFERENCE) + set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) + set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY}) + add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) + message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + endif() endif() if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 6a8b15a6b60a2e5635dc78fc877f0c8da9a2a998..e4af34d10ed92c501dd805addb62747c91c00978 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -174,3 +174,7 @@ endif(WITH_GOLANG) if(WITH_GRPC) add_definitions(-DPADDLE_WITH_GRPC) endif(WITH_GRPC) + +if(WITH_BRPC_RDMA) + add_definitions(-DPADDLE_WITH_BRPC_RDMA) +endif(WITH_BRPC_RDMA) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 8e2c913b2caae0c4eeb844d2b51a8975e81c1592..30b227b6452abf44171a1a4e04569e66b16e67a4 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -14,6 +14,15 @@ INCLUDE(ExternalProject) +find_library(SSL_LIBRARY NAMES ssl) +ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY}) + 
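For readers unfamiliar with what the `WITH_SYSTEM_BLAS` / reference-cblas probe in the `cmake/cblas.cmake` hunk above actually locates: the discovered `cblas.h` and `libcblas` expose the standard CBLAS interface. A minimal, self-contained call (standard CBLAS API, not Paddle code):

```cpp
// C = alpha*A*B + beta*C for row-major 2x2 matrices, via the standard
// cblas_dgemm entry point that the reference-cblas probe links against.
#include <cblas.h>
#include <cstdio>

int main() {
  const double A[4] = {1.0, 2.0, 3.0, 4.0};
  const double B[4] = {5.0, 6.0, 7.0, 8.0};
  double C[4] = {0.0, 0.0, 0.0, 0.0};
  // M = N = K = 2; leading dimensions are 2 for all three matrices.
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 2, 2, 2,
              /*alpha=*/1.0, A, 2, B, 2, /*beta=*/0.0, C, 2);
  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // 19 22 / 43 50
  return 0;
}
```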
+find_library(CRYPTO_LIBRARY NAMES crypto) +ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY}) + + SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) @@ -22,14 +31,14 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args -set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf") +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib") # If a minimal .a is needed, you can set WITH_DEBUG_SYMBOLS=OFF ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/brpc/brpc" - GIT_TAG "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2" + GIT_REPOSITORY "https://github.com/gongweibao/brpc" + GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -42,6 +51,8 @@ ExternalProject_Add( -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_PREFIX_PATH=${prefix_path} -DBRPC_WITH_GLOG=ON + -DIOBUF_WITH_HUGE_BLOCK=ON + -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} ${EXTERNAL_OPTIONAL_ARGS} LIST_SEPARATOR | CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} @@ -49,7 +60,7 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) -ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy) +ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy) ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ADD_DEPENDENCIES(brpc extern_brpc) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9c42044ec163e9db1dd21d5c3915b010c30fdf1c..fd7fc16bff5651f022b484623243048fbd225b5a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID) set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif(NOT APPLE AND NOT ANDROID) +set_property(GLOBAL PROPERTY FLUID_MODULES "") +# find_fluid_modules is used to find all fluid modules for building +# the paddle fluid static library and inference libs +function(find_fluid_modules TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(FIND "${__target_path}" "fluid" pos) + if(pos GREATER 1) + get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) + set(fluid_modules ${fluid_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") + endif() +endfunction(find_fluid_modules) + function(merge_static_libs TARGET_NAME) set(libs ${ARGN}) list(REMOVE_DUPLICATES libs) @@ -250,6 +264,7 @@ function(cc_test TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (${cc_test_SERIAL}) set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
FLAGS_init_allocated_mem=true) endif() endif() endfunction(cc_test) @@ -314,6 +329,7 @@ function(nv_test TARGET_NAME) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) endif() endif() endfunction(nv_test) @@ -561,7 +577,7 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} + COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 850098297e1456487cb8a7b83dffd3d2b0478689..c6979713231f631f8757e4139d6f685d4554b54e 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -12,19 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -set_property(GLOBAL PROPERTY FLUID_MODULES "") -# find all fluid modules is used for paddle fluid static library -function(find_fluid_modules TARGET_NAME) - get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) - string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) - string(FIND "${__target_path}" "fluid" pos) - if(pos GREATER 1) - get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) - set(fluid_modules ${fluid_modules} ${TARGET_NAME}) - set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") - endif() -endfunction(find_fluid_modules) - # make package for paddle fluid shared and static library function(copy TARGET) set(options "") @@ -154,7 +141,7 @@ set(inference_deps paddle_fluid_shared paddle_fluid) if(WITH_CONTRIB) message(STATUS "installing contrib") set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") - if (WITH_ANAKIN) + if (WITH_ANAKIN AND WITH_GPU) copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api SRCS ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api @@ -163,9 +150,9 @@ if(WITH_CONTRIB) list(APPEND inference_deps contrib_anakin_inference_lib) endif() - copy(contrib_inference_lib DEPS paddle_inference_api + copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h - ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.* + ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api* DSTS ${contrib_dst_dir} ${contrib_dst_dir}) list(APPEND inference_deps contrib_inference_lib) endif() diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index 264506a68ae17d081dd58ef4794bf7723f6d021c..d443c49657b92583e527035f49e74462cf41487d 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -1468,6 +1468,14 @@ argmax .. autofunction:: paddle.fluid.layers.argmax :noindex: +.. _api_fluid_layers_argsort: + +argsort +------- + +.. autofunction:: paddle.fluid.layers.argsort + :noindex: + .. 
_api_fluid_layers_ones: ones diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md index 967317d5d2eeb818ab14faabca342cc8c4ed717e..4d2aab87b8cf30d03075e96cc4c67070efaf963a 100644 --- a/doc/fluid/design/multi_devices/kernel_selection.md +++ b/doc/fluid/design/multi_devices/kernel_selection.md @@ -74,10 +74,10 @@ void OperatorWithKernel::Run( auto kernel_type_for_var = this->GetKernelTypeForVar(...); if (kernel_type_for_var.place_ != expected_kernel_key.place_) { auto* trans_var = new_scope.Var(var_name); - auto* out = DataTransform(expected_kernel_key, + auto* out = TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in); - CopyVariableWithTensor(...); + SetTensorToVariable(...); } } diff --git a/doc/v2/design/interface/00.why_plain_c.md b/doc/v2/design/interface/00.why_plain_c.md index a1443093342c5a3ed698fb6b52a751dfc7cb5319..826ff3141bc2512b525cb44ac0f18b376ce57e92 100644 --- a/doc/v2/design/interface/00.why_plain_c.md +++ b/doc/v2/design/interface/00.why_plain_c.md @@ -65,7 +65,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix matrix, 而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp` ```cpp -#include "paddle/math/matrix.h" +#include "paddle/legacy/math/matrix.h" extern "C" paddle_error paddle_matrix_shape(paddle_matrix matrix, uint64_t *width, diff --git a/doc/v2/dev/new_layer_cn.rst b/doc/v2/dev/new_layer_cn.rst index 3115654b2bd87995fa63bb7828fd1b3039aea8cc..e5a14346123d342de0b67757cbbce654bd4180dc 100644 --- a/doc/v2/dev/new_layer_cn.rst +++ b/doc/v2/dev/new_layer_cn.rst @@ -58,7 +58,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 实现C++类 =================== -一个网络层的C++类需要实现初始化,前向和后向。全连接层的实现位于:code:`paddle/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。 +一个网络层的C++类需要实现初始化,前向和后向。全连接层的实现位于:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。 这个类需要继承 :code:`paddle::Layer` 这个基类,并且需要重写基类中的以下几个虚函数: @@ -153,7 +153,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 - 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。 - 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小,所以这一步是必要的。 :code:`reserveOutput` 会相应地改变输出的尺寸。为了保证效率,如果需要扩大矩阵,我们会重新分配内存;如果需要缩减矩阵,我们会继续使用现有的内存块。 -- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵,每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作,请参考 :code:`paddle/math/Matrix.h`和:code:`paddle/math/BaseMatrix.h` 。 +- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵,每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作,请参考 :code:`paddle/legacy/math/Matrix.h`和:code:`paddle/legacy/math/BaseMatrix.h` 。 - 最终,使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。 @@ -262,7 +262,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 REGISTER_LAYER(fc, FullyConnectedLayer); } -若 :code:`cpp` 被放在 :code:`paddle/gserver/layers` 目录下,其会自动被加入编译列表。 +若 :code:`cpp` 被放在 :code:`paddle/legacy/gserver/layers` 目录下,其会自动被加入编译列表。 写梯度检查单元测试 @@ -270,7 +270,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ,然后观察到输出的变化为 :math:`\Delta y` ,那么,梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后,再用这个梯度去和 :code:`backward` 函数得到的梯度去对比,以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算,并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。 -所有网络层的梯度检查单测都位于 :code:`paddle/gserver/tests/test_LayerGrad.cpp` 
。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步: +所有网络层的梯度检查单测都位于 :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步: + 生成网络层配置。网络层配置包含以下几项: - 偏置参数的大小。(例子中是4096) @@ -322,7 +322,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 } } -如果你要为了测试而增加新的文件,例如 :code:`paddle/gserver/tests/testFCGrad.cpp` ,你需要把该文件加入 :code:`paddle/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时,所有的单测都会被执行一次。注意,有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。 +如果你要为了测试而增加新的文件,例如 :code:`paddle/legacy/gserver/tests/testFCGrad.cpp` ,你需要把该文件加入 :code:`paddle/legacy/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时,所有的单测都会被执行一次。注意,有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。 .. code-block:: bash diff --git a/doc/v2/dev/new_layer_en.rst b/doc/v2/dev/new_layer_en.rst index b05bb45f11eb253dfb87d6283c29ec6689394d22..6a848a020df343c14601b9c3fcb5fb6fcde7f880 100644 --- a/doc/v2/dev/new_layer_en.rst +++ b/doc/v2/dev/new_layer_en.rst @@ -58,7 +58,7 @@ Finally we can use chain rule to calculate :math:`\frac{\partial z}{\partial x}` Implement C++ Class =================== -The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below. +The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`. We list a simplified version of the code below. It needs to derive the base class :code:`paddle::Layer`, and it needs to override the following functions: @@ -154,7 +154,7 @@ The implementation of the forward part has the following steps. - Every layer must call :code:`Layer::forward(passType);` at the beginning of its :code:`forward` function. - Then it allocates memory for the output using :code:`reserveOutput(batchSize, size);`. This step is necessary because we support the batches to have different batch sizes. :code:`reserveOutput` will change the size of the output accordingly. For the sake of efficiency, we will allocate new memory if we want to expand the matrix, but we will reuse the existing memory block if we want to shrink the matrix. -- Then it computes :math:`\sum_i W_i x + b` using Matrix operations. :code:`getInput(i).value` retrieve the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents an single input in a batch. For a complete lists of supported matrix operations, please refer to :code:`paddle/math/Matrix.h` and :code:`paddle/math/BaseMatrix.h`. +- Then it computes :math:`\sum_i W_i x + b` using Matrix operations. :code:`getInput(i).value` retrieves the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents a single input in a batch. For a complete list of supported matrix operations, please refer to :code:`paddle/legacy/math/Matrix.h` and :code:`paddle/legacy/math/BaseMatrix.h`. - Finally it applies the activation function using :code:`forwardActivation();`. It will automatically apply the corresponding activation function specified in the network configuration.
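Both language versions of this guide describe the same finite-difference recipe that `test_LayerGrad.cpp` applies to whole layers. The idea is framework-independent; here is a self-contained sketch with a hypothetical scalar function `f` (not the Paddle test harness), using the central difference (f(x+eps) - f(x-eps)) / (2*eps):

```cpp
// Finite-difference gradient check for f(x) = x^2 + 3x, whose analytic
// derivative is 2x + 3: compare the central-difference estimate with the
// gradient a backward pass would produce.
#include <cmath>
#include <cstdio>

static double f(double x) { return x * x + 3.0 * x; }
static double analytic_grad(double x) { return 2.0 * x + 3.0; }

int main() {
  const double x = 1.5;
  const double eps = 1e-5;
  const double numeric = (f(x + eps) - f(x - eps)) / (2.0 * eps);
  const double rel_err =
      std::fabs(numeric - analytic_grad(x)) / std::fabs(analytic_grad(x));
  std::printf("numeric=%.8f analytic=%.8f rel_err=%.2e\n", numeric,
              analytic_grad(x), rel_err);
  return rel_err < 1e-6 ? 0 : 1;  // fail the check on a large mismatch
}
```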
@@ -263,7 +263,7 @@ Finally, you can use :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` to registe REGISTER_LAYER(fc, FullyConnectedLayer); } -If the :code:`cpp` file is put into :code:`paddle/gserver/layers`, it will be automatically added to the compilation list. +If the :code:`cpp` file is put into :code:`paddle/legacy/gserver/layers`, it will be automatically added to the compilation list. Write Gradient Check Unit Test @@ -271,7 +271,7 @@ Write Gradient Check Unit Test An easy way to verify the correctness of new layer's implementation is to write a gradient check unit test. Gradient check unit test utilizes finite difference method to verify the gradient of a layer. It modifies the input with a small perturbation :math:`\Delta x` and observes the changes of output :math:`\Delta y`, the gradient can be computed as :math:`\frac{\Delta y}{\Delta x }`. This gradient can be compared with the gradient computed by the :code:`backward` function of the layer to ensure the correctness of the gradient computation. Notice that the gradient check only tests the correctness of the gradient computation, it does not necessarily guarantee the correctness of the implementation of the :code:`forward` and :code:`backward` function. You need to write more sophisticated unit tests to make sure your layer is implemented correctly. -All the gradient check unit tests are located in :code:`paddle/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient test of the gradient check unit test of the fully connected layer is listed below. It has the following steps. +All the gradient check unit tests are located in :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient test of the gradient check unit test of the fully connected layer is listed below. It has the following steps. + Create layer configuration. A layer configuration can include the following attributes: - size of the bias parameter. (4096 in our example) @@ -323,7 +323,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes } } -If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake. +If you are creating a new file for the test, such as :code:`paddle/legacy/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/legacy/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake. .. 
code-block:: bash diff --git a/doc/v2/faq/parameter/index_cn.rst b/doc/v2/faq/parameter/index_cn.rst index 1fa4b3e1311d2007ccba98fde9ff94300ea42c16..987e8cf088be4ee8daa7c28fdc855506cbfd31c7 100644 --- a/doc/v2/faq/parameter/index_cn.rst +++ b/doc/v2/faq/parameter/index_cn.rst @@ -196,6 +196,6 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数 obj="process", args={"src_dict_path": src_dict_path}) -完整源码可参考 `sequence_recurrent `_ 示例。 +完整源码可参考 `sequence_recurrent `_ 示例。 diff --git a/doc/v2/howto/optimization/gpu_profiling_cn.rst b/doc/v2/howto/optimization/gpu_profiling_cn.rst index 25bcaccb6975bc21fba2e8c5843da15c69948d72..f2396716bddd4810fa77c738d41f5482aa6d6055 100644 --- a/doc/v2/howto/optimization/gpu_profiling_cn.rst +++ b/doc/v2/howto/optimization/gpu_profiling_cn.rst @@ -50,12 +50,12 @@ GPU则还需要高并行性,才能发挥其全部能力。这正是它们速 **nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。 在这个教程中,我们主要会介绍nvprof和nvvp。 -:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate +:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate above profilers. -:code:`paddle/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。 +:code:`paddle/legacy/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。 -.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp +.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :linenos: @@ -83,7 +83,7 @@ program crashes when CPU version of PaddlePaddle invokes them. 1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。 - .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :emphasize-lines: 8-12,14 @@ -101,8 +101,8 @@ program crashes when CPU version of PaddlePaddle invokes them. .. code-block:: bash :emphasize-lines: 1,12-15 - > ./paddle/math/tests/test_GpuProfiler - I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler + > ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. [==========] Running 1 test from 1 test case. @@ -130,7 +130,7 @@ nvprof 工具 1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。 - .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :emphasize-lines: 6-7 @@ -147,13 +147,13 @@ nvprof 工具 .. code-block:: bash - nvprof ./paddle/math/tests/test_GpuProfiler + nvprof ./paddle/legacy/math/tests/test_GpuProfiler 然后,您就能获得如下的分析结果: .. 
code-block:: bash - ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler + ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler ==78544== Profiling result: Time(%) Time Calls Avg Min Max Name 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] diff --git a/doc/v2/howto/optimization/gpu_profiling_en.rst b/doc/v2/howto/optimization/gpu_profiling_en.rst index 50adb7da24906515cb5977db565e9f8a76599fef..6e439be9bba8935cdd65f1c131cfd3725530ec0e 100644 --- a/doc/v2/howto/optimization/gpu_profiling_en.rst +++ b/doc/v2/howto/optimization/gpu_profiling_en.rst @@ -51,10 +51,10 @@ For general GPU profiling, a bunch of tools are provided from both NVIDIA and th **nvprof** is Nvidia profiler and **nvvp** is (GUI based) Nvidia visual profiler. In this tutorial, we will focus on nvprof and nvvp. -:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate +:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate above profilers. -.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp +.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :linenos: @@ -80,7 +80,7 @@ As a simple example, consider the following: 1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines). - .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :emphasize-lines: 8-12,14 @@ -98,8 +98,8 @@ As a simple example, consider the following: .. code-block:: bash :emphasize-lines: 1,12-15 - > ./paddle/math/tests/test_GpuProfiler - I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler + > ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. [==========] Running 1 test from 1 test case. @@ -127,7 +127,7 @@ To use this command line profiler **nvprof**, you can simply issue the following 1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines). - .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :emphasize-lines: 6-7 @@ -144,13 +144,13 @@ To use this command line profiler **nvprof**, you can simply issue the following .. code-block:: bash - nvprof ./paddle/math/tests/test_GpuProfiler + nvprof ./paddle/legacy/math/tests/test_GpuProfiler Then, you can get the following profiling result: .. 
code-block:: bash - ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler + ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler ==78544== Profiling result: Time(%) Time Calls Avg Min Max Name 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst index 67c7b774e9c476a3035037a421c84ebf17a31b09..9d6d417075485dceb1ee71f527b408aa6a6638ea 100644 --- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst +++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst @@ -4,7 +4,7 @@ 单双层RNN API对比介绍 ##################### -本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp `_\ 。 +本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp `_\ 。 示例1:双层RNN,子序列间无Memory ================================ @@ -13,8 +13,8 @@ 在本示例中,单层RNN和双层RNN的网络配置,都是将每一句分好词后的句子,使用LSTM作为encoder,压缩成一个向量。区别是RNN使用两层序列模型,将多句话看成一个整体同时使用encoder压缩。二者语意上完全一致。这组语义相同的示例配置如下: -* 单层RNN\: `sequence_layer_group.conf `_ -* 双层RNN\: `sequence_nest_layer_group.conf `_ +* 单层RNN\: `sequence_layer_group.conf `_ +* 双层RNN\: `sequence_nest_layer_group.conf `_ 读取双层序列数据 @@ -24,18 +24,18 @@ - 本例中的原始数据一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。 -.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg :language: text - 双层序列数据一共有4个样本。 每个样本间用空行分开,整体数据和原始数据完全一样。但于双层序列的LSTM来说,第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。 -.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest :language: text -其次,对于两种不同的输入数据类型,不同DataProvider对比如下(`sequenceGen.py `_)\: +其次,对于两种不同的输入数据类型,不同DataProvider对比如下(`sequenceGen.py `_)\: -.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py :language: python :lines: 21-39 :linenos: @@ -47,7 +47,7 @@ - words是原始数据中的每一句话,所对应的词表index数组。它是integer_value_sequence类型的,即整数数组。words即为这个数据中的单层时间序列。 - label是原始数据中对于每一句话的分类标签,它是integer_value类型的。 -.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py :language: python :lines: 42-71 :linenos: @@ -64,7 +64,7 @@ 首先,我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中,RNN对于每一个时间步通过了一个LSTM网络。 -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf :language: python :lines: 38-63 :linenos: @@ -85,7 +85,7 @@ * 至此,\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。 -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf :language: python :lines: 38-64 :linenos: @@ -107,7 +107,7 @@ - 单层RNN:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf +.. 
literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf :language: python :lines: 36-48 @@ -116,7 +116,7 @@ - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。 -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf :language: python :lines: 39-66 @@ -134,7 +134,7 @@ **输入不等长** 是指recurrent_group的多个输入序列,在每个时间步的子序列长度可以不相等。但序列输出时,需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致,默认指定第一个输入。 -示例3的配置分别为\ `单层不等长RNN `_\ 和\ `双层不等长RNN `_\ 。 +示例3的配置分别为\ `单层不等长RNN `_\ 和\ `双层不等长RNN `_\ 。 示例3对于单层RNN和双层RNN数据完全相同。 @@ -152,14 +152,14 @@ * 单层RNN\: -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py :language: python :lines: 42-59 :linenos: * 双层RNN\ \: -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py :language: python :lines: 41-80 :linenos: diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst index ae997f0805db5b01a34867c9e8b188c931721920..a4485f7b5edf21871444801230ab1ee191b1137b 100644 --- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst +++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst @@ -4,7 +4,7 @@ API comparison between RNN and hierarchical RNN ##################### -This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illestrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ 。The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp `_\ 。 +This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illustrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems.
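An aside on the flat-versus-nested distinction these RNN documents keep returning to: in the legacy API, sequence boundaries are carried as offset arrays (`sequenceStartPositions`, plus `subSequenceStartPositions` for nested input). The sketch below shows one plausible offset encoding for the `[2, 3, 2, 3]` example discussed in these files; Paddle's actual convention differs in details, so the numbers are illustrative only.

```cpp
// Illustrative two-level offset encoding for nested sequences: 4 samples
// containing [2, 3, 2, 3] sentences. Inner offsets delimit sentences in
// units of words; outer offsets group sentences into samples.
#include <cstdio>
#include <vector>

int main() {
  // 10 sentences total; entries i and i+1 bound sentence i's word range.
  std::vector<int> sub_seq_starts = {0, 4, 9, 12, 18, 25, 31, 34, 40, 47, 53};
  // 4 samples: sentences [0,2), [2,5), [5,7), [7,10) -> counts 2, 3, 2, 3.
  std::vector<int> seq_starts = {0, 2, 5, 7, 10};

  for (size_t s = 0; s + 1 < seq_starts.size(); ++s) {
    std::printf("sample %zu:", s);
    for (int j = seq_starts[s]; j < seq_starts[s + 1]; ++j) {
      std::printf(" words[%d..%d)", sub_seq_starts[j], sub_seq_starts[j + 1]);
    }
    std::printf("\n");
  }
  return 0;
}
```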
If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ . The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp `_\ . Example 1: Hierarchical RNN without Memory between subsequences ================================ The classical case in the hierarchical RNN is to perform sequence operations on In this example, the network configuration of single-layer RNNs and hierarchical RNNs are all to use LSTM as an encoder to compress a word-segmented sentence into a vector. The difference is that RNN uses a hierarchical RNN model, treating multiple sentences as a whole to use encoder to compress simultaneously. They are completely consistent in their semantic meanings. This pair of semantically identical example configurations is as follows: -* RNN\: `sequence_layer_group.conf `_ -* Hierarchical RNN\: `sequence_nest_layer_group.conf `_ +* RNN\: `sequence_layer_group.conf `_ +* Hierarchical RNN\: `sequence_nest_layer_group.conf `_ Reading hierarchical sequence data @@ -24,18 +24,18 @@ Firstly, the original data in this example is as follows \: - The original data in this example has 10 samples. Each of the samples includes two components: a label (all 2 here), and a word-segmented sentence. This data is used by single RNN as well. -.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg :language: text - The data for hierarchical RNN has 4 samples. Every sample is separated by a blank line, while the content of the data is the same as the original data. But as for hierarchical LSTM, the first sample will encode two sentences into two vectors simultaneously. The sentence counts dealt with simultaneously by these 4 samples are \ :code:`[2, 3, 2, 3]`\ . -.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest :language: text -Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py `_)\: +Secondly, as for these two types of different input data formats, the contrast of different DataProviders is as follows (`sequenceGen.py `_)\: -.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py :language: python :lines: 21-39 :linenos: @@ -47,7 +47,7 @@ Secondly, as for these two types of different input data formats, the contrast o - "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is, an integer list. So, "words" is a single-layer time series in the data. - "label" is the categorical label of each sentence, whose data type is integer_value. -.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py :language: python :lines: 42-71 :linenos: @@ -64,7 +64,7 @@ Model configuration Firstly, let's look at the configuration of single-layer RNN. The highlighted part of line 9 to line 15 is the usage of single-layer RNN. Here we use the pre-defined RNN process function in PaddlePaddle. In this function, for each time step, RNN passes through an LSTM network. -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf +.. 
literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf :language: python :lines: 38-63 :linenos: @@ -85,7 +85,7 @@ Secondly, let's look at the model configuration of hierarchical RNN which has th * Till now, \ :code:`lstm_last`\ has the same result as \ :code:`lstm_last`\ in single-layer RNN configuration. -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf :language: python :lines: 38-64 :linenos: @@ -107,7 +107,7 @@ We select the different parts between single-layer RNN and hierarchical RNN conf - single-layer RNN:passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer. -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf :language: python :lines: 36-48 @@ -116,7 +116,7 @@ We select the different parts between single-layer RNN and hierarchical RNN conf - The recurrent_group of inner layer's inner_step is nearly the same as single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state for the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state for the next subsequence's memory. - From the aspect of the input data, sentences from single-layer and hierarchical RNN are the same. The only difference is that, hierarchical RNN disassembes the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it makes no difference with "every time step uses the output of last time step" in the sigle-layer RNN configuration. -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf :language: python :lines: 39-66 @@ -134,7 +134,7 @@ Example 3:hierarchical RNN with unequal length inputs **unequal length inputs** means in the multiple input sequences of recurrent_group, the lengths of subsequences can be unequal. But the output of the sequence, needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ can help you specify which of the input sequences and the output sequence can be consistent, by default is the first input. -The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs `_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs `_\ . +The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs `_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs `_\ . The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same. @@ -152,14 +152,14 @@ Similar to Example 2's configuration, Example 3's configuration uses single-laye * single-layer RNN\: -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py :language: python :lines: 42-59 :linenos: * hierarchical RNN\ \: -.. 
literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py :language: python :lines: 41-80 :linenos: diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index f17577997bc94b08f3e296c4d6e35682ca3c0e57..eba0c47e195a80fc298f0fdd78c8d6345e963be8 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -16,7 +16,7 @@ package pserver // #cgo CFLAGS: -I ../../ // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm -// #include "paddle/optimizer/optimizer.h" +// #include "paddle/legacy/optimizer/optimizer.h" // #include // #include import "C" diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index d722eec1892206ac44c49e7a12d92be0c54df8c0..7a4bd9183a6dce606d595044852555b04f0e06b2 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -1,24 +1,24 @@ if(NOT WITH_FLUID_ONLY) - add_subdirectory(cuda) - add_subdirectory(function) + add_subdirectory(legacy/cuda) + add_subdirectory(legacy/function) add_subdirectory(utils) - add_subdirectory(math) - add_subdirectory(gserver) - add_subdirectory(parameter) + add_subdirectory(legacy/math) + add_subdirectory(legacy/gserver) + add_subdirectory(legacy/parameter) if(MOBILE_INFERENCE) - add_subdirectory(capi) + add_subdirectory(legacy/capi) else() - add_subdirectory(pserver) + add_subdirectory(legacy/pserver) add_subdirectory(trainer) add_subdirectory(scripts) if(WITH_C_API) - add_subdirectory(capi) + add_subdirectory(legacy/capi) endif() if(WITH_SWIG_PY) - add_subdirectory(api) + add_subdirectory(legacy/api) endif() endif() endif() diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp deleted file mode 100644 index 62d6a574d55d2748635879a21cbbaa474f070cff..0000000000000000000000000000000000000000 --- a/paddle/api/Arguments.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "PaddleAPI.h" -#include "PaddleAPIPrivate.h" - -#include "paddle/parameter/Argument.h" - -size_t Arguments::getSlotNum() const { return m->outputs.size(); } - -Arguments* Arguments::createArguments(size_t slotNum) { - auto args = new Arguments(); - args->m->outputs.resize(slotNum); - return args; -} - -void Arguments::resize(size_t slotNum) { m->outputs.resize(slotNum); } - -Arguments::Arguments() : m(new ArgumentsPrivate()) {} - -Arguments::~Arguments() { delete m; } - -Arguments* Arguments::createByPaddleArgumentVector(void* ptr) { - auto p = (std::vector*)(ptr); - auto args = new Arguments(); - args->m->outputs = *p; - return args; -} - -Arguments* Arguments::createByPaddleArgument(const void* ptr) { - auto p = (paddle::Argument*)(ptr); - auto args = new Arguments(); - args->m->outputs.push_back(*p); - return args; -} - -Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return Matrix::createByPaddleMatrixPtr(&a.value); -} - -Matrix* Arguments::getSlotGrad(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return Matrix::createByPaddleMatrixPtr(&a.grad); -} - -IVector* Arguments::getSlotIds(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return IVector::createByPaddleVectorPtr(&a.ids); -} - -Matrix* Arguments::getSlotIn(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return Matrix::createByPaddleMatrixPtr(&a.in); -} - -void Arguments::setSlotValue(size_t idx, Matrix* mat) throw(RangeError) { - auto& a = m->getArg(idx); - a.value = m->cast(mat->getSharedPtr()); -} - -void Arguments::setSlotGrad(size_t idx, Matrix* mat) throw(RangeError) { - auto& a = m->getArg(idx); - a.grad = m->cast(mat->getSharedPtr()); -} - -void Arguments::setSlotIn(size_t idx, Matrix* mat) throw(RangeError) { - auto& a = m->getArg(idx); - a.in = m->cast(mat->getSharedPtr()); -} - -void Arguments::setSlotIds(size_t idx, IVector* vec) throw(RangeError) { - auto& a = m->getArg(idx); - auto& v = m->cast(vec->getSharedPtr()); - a.ids = v; -} - -template -static inline void doCopyFromSafely(std::shared_ptr& dest, - std::shared_ptr& src) { - if (src) { - if (dest) { - dest->copyFrom(*src); - } else { - dest = src; - } - } -} - -IVector* Arguments::getSlotSequenceStartPositions(size_t idx) const - throw(RangeError) { - auto& a = m->getArg(idx); - if (a.sequenceStartPositions) { - return IVector::createByPaddleVectorPtr( - &a.sequenceStartPositions->getMutableVector(false)); - } else { - return nullptr; - } -} - -IVector* Arguments::getSlotSubSequenceStartPositions(size_t idx) const - throw(RangeError) { - auto& a = m->getArg(idx); - if (a.subSequenceStartPositions) { - return IVector::createByPaddleVectorPtr( - &a.subSequenceStartPositions->getMutableVector(false)); - } else { - return nullptr; - } -} - -void Arguments::setSlotSequenceStartPositions(size_t idx, - IVector* vec) throw(RangeError) { - auto& a = m->getArg(idx); - auto& v = m->cast(vec->getSharedPtr()); - a.sequenceStartPositions = std::make_shared(v); -} - -void Arguments::setSlotSubSequenceStartPositions( - size_t idx, IVector* vec) throw(RangeError) { - auto& a = m->getArg(idx); - auto& v = m->cast(vec->getSharedPtr()); - a.subSequenceStartPositions = std::make_shared(v); -} - -IVector* Arguments::getSlotSequenceDim(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return IVector::createByPaddleVectorPtr(&a.cpuSequenceDims); -} - -void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) { - auto& a = 
m->getArg(idx); - a.cpuSequenceDims = m->cast(vec->getSharedPtr()); -} - -float Arguments::sum() const { return paddle::Argument::sum(m->outputs); } - -int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return a.getBatchSize(); -} - -void Arguments::setSlotFrameHeight(size_t idx, size_t h) throw(RangeError) { - auto& a = m->getArg(idx); - a.setFrameHeight(h); -} - -void Arguments::setSlotFrameWidth(size_t idx, size_t w) throw(RangeError) { - auto& a = m->getArg(idx); - a.setFrameWidth(w); -} - -size_t Arguments::getSlotFrameHeight(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return a.getFrameHeight(); -} - -size_t Arguments::getSlotFrameWidth(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return a.getFrameWidth(); -} - -void* Arguments::getInternalArgumentsPtr() const { return &m->outputs; } diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp deleted file mode 100644 index 0d9ad30de9c1f3f8f58c856a748abdc050ff8740..0000000000000000000000000000000000000000 --- a/paddle/api/GradientMachine.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "PaddleAPI.h" -#include "PaddleAPIPrivate.h" - -#include "Internal.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" - -std::vector GradientMachine::defaultParamTypes = { - PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}; - -GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {} - -GradientMachine::~GradientMachine() { delete m; } - -GradientMachine* GradientMachine::createFromPaddleModelPtr( - const void* confPtr, - GradientMatchineCreateMode mode, - const std::vector& types) { - auto& conf = *(const paddle::ModelConfig*)(confPtr); - std::vector realTypes; - staticCastVector(&realTypes, types); - auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes); - auto machinePtr = std::shared_ptr(machineRawPtr); - if (machinePtr != nullptr) { - auto machine = new GradientMachine(); - machine->m->machine = machinePtr; - return machine; - } else { - return nullptr; - } -} - -GradientMachine* GradientMachine::createByConfigProtoStr( - const std::string& protoStr, - GradientMatchineCreateMode mode, - const std::vector& types) { - paddle::ModelConfig conf; - conf.ParseFromString(protoStr); - if (conf.IsInitialized()) { - return GradientMachine::createFromPaddleModelPtr(&conf, mode, types); - } else { - return nullptr; - } -} - -GradientMachine* GradientMachine::createByModelConfig( - ModelConfig* conf, - GradientMatchineCreateMode mode, - const std::vector& types) { - auto confPtr = &conf->m->conf->getModelConfig(); - return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types); -} - -void GradientMachine::start() { m->machine->start(); } - -void GradientMachine::finish() { m->machine->finish(); } - -void GradientMachine::onPassEnd() { m->machine->onPassEnd(); } - -void 
GradientMachine::prefetch(const Arguments& inArgs) { - auto& in = - m->cast>(inArgs.getInternalArgumentsPtr()); - m->machine->prefetch(in); -} - -void GradientMachine::forward(const Arguments& inArgs, - Arguments* outArgs, - PassType passType) { - auto& in = - m->cast>(inArgs.getInternalArgumentsPtr()); - auto& out = m->cast>( - outArgs->getInternalArgumentsPtr()); - paddle::PassType pt = (paddle::PassType)(passType); - m->machine->forward(in, &out, pt); -} - -UpdateCallback::~UpdateCallback() {} - -void UpdateCallback::apply(Parameter* p) { - // UNUSED(p); -} - -class UpdateCallbackWrapper { - public: - explicit UpdateCallbackWrapper(const UpdateCallback& callback) - : callback(const_cast(callback)) {} - - void operator()(paddle::Parameter* param) { - auto p = Parameter::createFromRawPtr(¶m); - // @TODO Use Stack variable instead. - callback.apply(p); - delete p; - } - - private: - UpdateCallback& callback; -}; - -void GradientMachine::backward(const UpdateCallback& callback) { - m->machine->backward(UpdateCallbackWrapper(callback)); -} - -void GradientMachine::forwardBackward(const Arguments& inArgs, - Arguments* outArgs, - PassType passType, - const UpdateCallback& callback) { - auto& in = - m->cast>(inArgs.getInternalArgumentsPtr()); - auto& out = m->cast>( - outArgs->getInternalArgumentsPtr()); - paddle::PassType pt = (paddle::PassType)(passType); - m->machine->forwardBackward(in, &out, pt, UpdateCallbackWrapper(callback)); -} - -void GradientMachine::loadParameters(const std::string& path) { - m->machine->loadParameters(path); -} - -size_t GradientMachine::getParameterSize() const { - return m->machine->getParameters().size(); -} - -Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) { - auto params = m->machine->getParameters(); - if (i < params.size()) { - return Parameter::createFromSharedPtr(&m->machine->getParameters()[i]); - } else { - throw RangeError(); - } -} - -size_t GradientMachine::getNonStaticParameterSize() const { - return m->machine->getNonStaticParameters().size(); -} - -Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) { - auto params = m->machine->getNonStaticParameters(); - if (i < params.size()) { - return Parameter::createFromSharedPtr( - &m->machine->getNonStaticParameters()[i]); - } else { - throw RangeError(); - } -} - -void GradientMachine::randParameters() { m->machine->randParameters(); } - -Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const - throw(UnsupportError) { - auto nn = m->machine; - if (nn) { - auto arg = nn->getLayerOutput(layerName); - return Arguments::createByPaddleArgument(&arg); - } else { - throw UnsupportError(); - } -} - -SequenceGenerator* GradientMachine::asSequenceGenerator( - const std::vector& dict, - size_t begin_id, - size_t end_id, - size_t max_length, - size_t beam_size) { - SequenceGenerator* r = - SequenceGenerator::createByGradientMachineSharedPtr(&m->machine); - r->setDict(dict); - r->setBos(begin_id); - r->setEos(end_id); - r->setMaxLength(max_length); - r->setBeamSize(beam_size); - return r; -} - -Evaluator* GradientMachine::makeEvaluator() { - auto ev = new Evaluator(); - ev->m->rawPtr = m->machine->makeEvaluator(); - return ev; -} - -void GradientMachine::eval(Evaluator* evaluator) { - m->machine->eval(evaluator->m->rawPtr); -} diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp deleted file mode 100644 index 8282b4629dc08a7fcd9b52cbc3492ac10d8ed55c..0000000000000000000000000000000000000000 --- a/paddle/api/Matrix.cpp +++ /dev/null @@ -1,317 
+0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/math/Matrix.h" -#include -#include -#include "PaddleAPI.h" -#include "paddle/math/CpuSparseMatrix.h" -#include "paddle/math/SparseMatrix.h" - -struct MatrixPrivate { - std::shared_ptr mat; -}; - -Matrix::Matrix() : m(new MatrixPrivate()) {} - -Matrix* Matrix::createByPaddleMatrixPtr(void* sharedPtr) { - auto* mat = reinterpret_cast(sharedPtr); - if ((*mat) != nullptr) { - auto m = new Matrix(); - m->m->mat = *mat; - return m; - } else { - return nullptr; - } -} - -Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) { - auto m = new Matrix(); - m->m->mat = paddle::Matrix::create(height, width, useGpu); - m->m->mat->zero(); - return m; -} - -Matrix* Matrix::createDense(const std::vector& data, - size_t height, - size_t width, - bool useGpu) { - auto m = new Matrix(); - m->m->mat = paddle::Matrix::create(height, width, useGpu); - m->m->mat->copyFrom(data.data(), data.size()); - return m; -} - -Matrix* Matrix::createDenseFromNumpy(float* data, - int dim1, - int dim2, - bool copy, - bool useGpu) throw(UnsupportError) { - if (useGpu) { - /// Gpu mode only supports copy=True - if (!copy) { - throw UnsupportError("Gpu mode only supports copy=True"); - } - return Matrix::createGpuDenseFromNumpy(data, dim1, dim2); - } else { - return Matrix::createCpuDenseFromNumpy(data, dim1, dim2, copy); - } -} - -Matrix* Matrix::createCpuDenseFromNumpy(float* data, - int dim1, - int dim2, - bool copy) { - auto m = new Matrix(); - if (copy) { - m->m->mat = paddle::Matrix::create(dim1, dim2); - m->m->mat->copyFrom(data, dim1 * dim2); - } else { - m->m->mat = paddle::Matrix::create(data, dim1, dim2, false); - } - return m; -} - -Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) { - auto m = new Matrix(); - m->m->mat = paddle::Matrix::create(dim1, dim2, false, true); - m->m->mat->copyFrom(data, dim1 * dim2); - return m; -} - -Matrix* Matrix::createSparse(size_t height, - size_t width, - size_t nnz, - bool isNonVal, - bool isTrans, - bool useGpu) { - auto m = new Matrix(); - m->m->mat = paddle::Matrix::createSparseMatrix( - height, - width, - nnz, - isNonVal ? 
paddle::NO_VALUE : paddle::FLOAT_VALUE, - isTrans, - useGpu); - return m; -} - -Matrix::~Matrix() { delete m; } - -size_t Matrix::getHeight() const { return m->mat->getHeight(); } - -size_t Matrix::getWidth() const { return m->mat->getWidth(); } - -float Matrix::get(size_t x, size_t y) const throw(RangeError) { - if (x > this->getWidth() || y > this->getHeight()) { - RangeError e; - throw e; - } - return m->mat->getElement(x, y); -} - -void Matrix::set(size_t x, size_t y, float val) throw(RangeError, - UnsupportError) { - if (x > this->getWidth() || y > this->getHeight()) { - RangeError e; - throw e; - } - auto rawMat = m->mat.get(); - if (auto cDenseMat = dynamic_cast(rawMat)) { - *(cDenseMat->getData() + x + y * cDenseMat->getWidth()) = val; - } else { - UnsupportError e; - throw e; - } -} - -bool Matrix::isSparse() const { - auto raw_mat = m->mat.get(); - return dynamic_cast(raw_mat) != nullptr || - dynamic_cast(raw_mat) != nullptr; -} - -SparseValueType Matrix::getSparseValueType() const throw(UnsupportError) { - auto cpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (cpuSparseMat != nullptr) { - return (SparseValueType)cpuSparseMat->getValueType(); - } else { - auto gpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (gpuSparseMat != nullptr) { - return (SparseValueType)gpuSparseMat->getValueType(); - } else { - UnsupportError e; - throw e; - } - } -} - -SparseFormatType Matrix::getSparseFormat() const throw(UnsupportError) { - auto cpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (cpuSparseMat != nullptr) { - return (SparseFormatType)cpuSparseMat->getFormat(); - } else { - auto gpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (gpuSparseMat != nullptr) { - return SPARSE_CSR; - } else { - UnsupportError e; - throw e; - } - } -} - -IntArray Matrix::getSparseRowCols(size_t i) const - throw(UnsupportError, RangeError) { - auto cpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (cpuSparseMat != nullptr && - cpuSparseMat->getFormat() == paddle::SPARSE_CSR) { - if (i < cpuSparseMat->getHeight()) { - // cpuSparseMat->print(std::cout); - size_t len = cpuSparseMat->getColNum(i); - return IntArray(cpuSparseMat->getRowCols(i), len); - } else { - RangeError e; - throw e; - } - } else { - UnsupportError e; - throw e; - } -} - -IntWithFloatArray Matrix::getSparseRowColsVal(size_t i) const - throw(UnsupportError, RangeError) { - auto cpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (cpuSparseMat != nullptr && - cpuSparseMat->getValueType() == paddle::FLOAT_VALUE) { - if (i < cpuSparseMat->getHeight()) { - return IntWithFloatArray(cpuSparseMat->getRowValues(i), - cpuSparseMat->getRowCols(i), - cpuSparseMat->getColNum(i)); - } else { - RangeError e; - throw e; - } - } else { - UnsupportError e; - throw e; - } -} - -FloatArray Matrix::getData() const { - auto rawMat = m->mat.get(); - if (dynamic_cast(rawMat->getMemoryHandle().get())) { - // is gpu. 
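// (Aside, illustrative only; not part of the original file.) A minimal dense
// round-trip through the accessors above, assuming only the declarations in
// PaddleAPI.h:
//
//   Matrix* mat = Matrix::createZero(/*height=*/2, /*width=*/3,
//                                    /*useGpu=*/false);
//   mat->set(0, 1, 0.5f);           // RangeError when out of range;
//                                   // UnsupportError unless CPU dense
//   float v = mat->get(0, 1);       // reads the element back
//   bool sparse = mat->isSparse();  // false for a createZero() matrix
//   delete mat;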
then copy data - float* data = rawMat->getData(); - size_t len = rawMat->getElementCnt(); - float* cpuData = new float[len]; - hl_memcpy_device2host(cpuData, data, len * sizeof(float)); - FloatArray ret_val(cpuData, len); - ret_val.needFree = true; - return ret_val; - } else { - FloatArray ret_val(rawMat->getData(), rawMat->getElementCnt()); - return ret_val; - } -} - -void Matrix::sparseCopyFrom( - const std::vector& rows, - const std::vector& cols, - const std::vector& vals) throw(UnsupportError) { - auto cpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (cpuSparseMat != nullptr) { - // LOG(INFO) <<"RowSize = "<isSparse()) { - throw UnsupportError(); - } else { - *dim1 = m->mat->getHeight(); - *dim2 = m->mat->getWidth(); - *view_m_data = new float[(*dim1) * (*dim2)]; - if (auto cpuMat = dynamic_cast(m->mat.get())) { - auto src = cpuMat->getData(); - auto dest = *view_m_data; - std::memcpy(dest, src, sizeof(paddle::real) * (*dim1) * (*dim2)); - } else if (auto gpuMat = dynamic_cast(m->mat.get())) { - auto src = gpuMat->getData(); - auto dest = *view_m_data; - hl_memcpy_device2host( - dest, src, sizeof(paddle::real) * (*dim1) * (*dim2)); - } else { - LOG(WARNING) << "Unexpected Situation"; - throw UnsupportError(); - } - } -} - -void Matrix::copyFromNumpyMat(float* data, - int dim1, - int dim2) throw(UnsupportError, RangeError) { - if (isSparse()) { - throw UnsupportError(); - } else { - if (this->getHeight() == (size_t)dim1 && this->getWidth() == (size_t)dim2) { - if (m->mat->getData() != data) { - m->mat->copyFrom(data, dim1 * dim2); - } - } else { - throw RangeError(); - } - } -} - -bool Matrix::isGpu() const { - auto rawPtr = m->mat.get(); - return dynamic_cast(rawPtr) != nullptr || - dynamic_cast(rawPtr) != nullptr; -} diff --git a/paddle/api/Paddle.i b/paddle/api/Paddle.i deleted file mode 100644 index 3237e73745dca58bed923b20851f0f0039a3487c..0000000000000000000000000000000000000000 --- a/paddle/api/Paddle.i +++ /dev/null @@ -1,202 +0,0 @@ -%module(directors="1") swig_paddle -%include "std_string.i" -%{ -#define SWIG_FILE_WITH_INIT -#include "api/PaddleAPI.h" -%} - -%include "exception.i" -%typemap(throws) UnsupportError %{ - SWIG_exception(SWIG_RuntimeError, $1.what()); - SWIG_fail; -%} - -%include "std_vector.i" -%include "std_pair.i" -#ifdef SWIGPYTHON -%include "numpy.i" -#endif - -%init %{ -#ifdef SWIGPYTHON -import_array(); -#endif -%} - - -namespace std { -%template(vector_int) vector; -%template(vector_uint) vector; -%template(vector_float) vector; -%template(vector_string) vector; -%template(vector_vec_star) vector; -} -#ifdef SWIGPYTHON -%typemap(in) (int argc, char** argv) { - int i = 0; - if (!PyList_Check($input)) { - PyErr_SetString(PyExc_ValueError, "Expecting a list"); - return NULL; - } - $1 = PyList_Size($input); - $2 = (char **) malloc(($1+1)*sizeof(char *)); - for (i = 0; i < $1; i++) { - PyObject *s = PyList_GetItem($input,i); - if (!PyString_Check(s)) { - free($2); - PyErr_SetString(PyExc_ValueError, "List items must be strings"); - return NULL; - } - $2[i] = PyString_AsString(s); - } - $2[i] = 0; -} -%typemap(freearg) (int argc, char** argv) { - if ($2) free($2); -} - -%typemap(out) FloatArray { - $result = PyList_New($1.length); - for (size_t i=0; i<$1.length; ++i) { - PyList_SetItem($result, i, PyFloat_FromDouble($1.buf[i])); - } - if($1.needFree) { - delete [] $1.buf; - } -} - -%typemap(out) IntArray { - $result = PyList_New($1.length); - for (size_t i=0; i<$1.length; ++i) { - PyList_SetItem($result, i, PyInt_FromLong($1.buf[i])); - } - if 
($1.needFree) { - delete [] $1.buf; - } -} - -%typemap(out) IntWithFloatArray { - $result = PyList_New($1.length); - for (size_t i=0; i<$1.length; ++i) { - PyList_SetItem($result, i, PyTuple_Pack(2, - PyInt_FromLong($1.idxBuf[i]), - PyFloat_FromDouble($1.valBuf[i]) - )); - } - if ($1.needFree) { - delete [] $1.idxBuf; - delete [] $1.valBuf; - } -} - - -%rename(__getitem__) IVector::get; -%rename(__setitem__) IVector::set; -%rename(__len__) IVector::getSize; -%rename(__getitem__) Vector::get; -%rename(__setitem__) Vector::set; -%rename(__len__) Vector::getSize; -%rename(__len__) Parameter::getSize; -%rename(__call__) ParameterTraverseCallback::apply; -%rename(__repr__) Evaluator::toString; - -%apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { - (float* data, int dim1, int dim2) -} - -%apply (float** ARGOUTVIEW_ARRAY2, int* DIM1, int* DIM2) { - (float** view_data, int* dim1, int* dim2) -} - -%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) { - (float** view_m_data, int* dim1, int* dim2) -} - -%apply (int** ARGOUTVIEWM_ARRAY1, int* DIM1) { - (int** view_m_data, int* dim1) -} - -%apply (int* INPLACE_ARRAY1, int DIM1) { - (int* data, int dim) -} - -%apply (int** ARGOUTVIEW_ARRAY1, int* DIM1) { - (int** view_data, int* dim1) -} - -%apply (float* INPLACE_ARRAY1, int DIM1) { - (float* data, int dim) -} - -%apply (float** ARGOUTVIEW_ARRAY1, int* DIM1) { - (float** view_data, int* dim1) -} - -%apply (float** ARGOUTVIEWM_ARRAY1, int* DIM1) { - (float** view_m_data, int* dim1) -} - -#endif -// The below functions internally create object by "new", so it should use -// use SWIG to handle gc. There are hints for SWIG to handle GC. -%newobject Matrix::createZero; -%newobject Matrix::createSparse; -%newobject Matrix::createDense; -%newobject Matrix::createDenseFromNumpy; -%newobject Matrix::createCpuDenseFromNumpy; -%newobject Matrix::createGpuDenseFromNumpy; -%newobject Vector::createZero; -%newobject Vector::create; -%newobject Vector::createVectorFromNumpy; -%newobject Vector::createCpuVectorFromNumpy; -%newobject Vector::createGpuVectorFromNumpy; -%newobject IVector::createZero; -%newobject IVector::create; -%newobject IVector::createVectorFromNumpy; -%newobject IVector::createCpuVectorFromNumpy; -%newobject IVector::createGpuVectorFromNumpy; -%newobject Trainer::createByCommandLine; -%newobject Trainer::getForwardOutput; -%newobject Trainer::getLayerOutput; -%newobject Arguments::getSlotValue; -%newobject Arguments::getSlotIds; -%newobject Arguments::getSlotIn; -%newobject Arguments::getSlotSequenceStartPositions; -%newobject Arguments::getSlotSequenceDim; -%newobject Arguments::createArguments; -%newobject GradientMachine::createByConfigProtoStr; -%newobject GradientMachine::createByModelConfig; -%newobject GradientMachine::asSequenceGenerator; -%newobject GradientMachine::getParameter; -%newobject GradientMachine::getLayerOutput; -%newobject GradientMachine::makeEvaluator; -%newobject TrainerConfig::createFromTrainerConfigFile; -%newobject TrainerConfig::getModelConfig; -%newobject TrainerConfig::getOptimizationConfig; -%newobject Parameter::getBuf; -%newobject Parameter::getConfig; -%newobject ParameterOptimizer::create; -%newobject ParameterOptimizer::needSpecialTraversal; -%newobject ParameterUpdater::createLocalUpdater; -%newobject ParameterUpdater::createRemoteUpdater; -%newobject ParameterUpdater::createNewRemoteUpdater; - -%feature("director") UpdateCallback; -%feature("autodoc", 1); // To generate method stub, for code hint in ide - -// Ignore many private class, and method 
cannot be handled by swig. -%ignore MatrixPrivate; -%ignore TrainerPrivate; -%ignore IVector::operator[]; -%ignore ArgumentsPrivate; -%ignore GradientMachinePrivate; -%ignore TrainerConfigPrivate; -%ignore ModelConfigPrivate; -%ignore ParameterPrivate; -%ignore SequenceGeneratorPrivate; -%ignore VectorPrivate; -%ignore ParameterConfigPrivate; -%ignore OptimizationConfigPrivate; -%ignore ParameterTraverseCallbackPrivate; -%include "utils/GlobalConstants.h" -%include "api/PaddleAPI.h" diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h deleted file mode 100644 index 7866122006a996cbe5201c661cab9c81aa82a219..0000000000000000000000000000000000000000 --- a/paddle/api/PaddleAPI.h +++ /dev/null @@ -1,1054 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include "paddle/gserver/gradientmachines/GradientMachine.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/GlobalConstants.h" - -/// Import PaddlePaddle's enumeration into global namespace. -using namespace paddle::enumeration_wrapper; // NOLINT - -/** - * @brief Initialize paddle. - * - * In python, this method should be invoked as - * @code - * import sys - * import paddle - * paddle.initPaddle(sys.argv) - * or you can change arguments as any list of str. - * @endcode - */ -void initPaddle(int argc, char** argv); - -/// Return FLAGS_use_gpu -bool isUsingGpu(); - -/// Set the Flags_use_gpu to the given parameter -void setUseGpu(bool useGpu); - -/// Return true if this py_paddle is compiled in GPU Version -bool isGpuVersion(); - -/// Return FLAGS_trainer_count -int getTrainerCount(); - -/// The Error of IO Operation. Such as file not found, etc. -class IOError {}; - -/// Out of range error -class RangeError {}; - -/// Not support Error, such as access GPU memory directly, etc. -class UnsupportError : public std::runtime_error { - public: - UnsupportError() : std::runtime_error(" ") {} - explicit UnsupportError(const std::string& message) - : std::runtime_error(message) {} -}; - -/// This type will map to python's list of float. -struct FloatArray { - const float* buf; - const size_t length; - bool needFree; // true if the buf is dynamic alloced. - FloatArray(const float* b, const size_t l); -}; - -/// This type will map to python's list of int -struct IntArray { - const int* buf; - const size_t length; - bool needFree; - IntArray(const int* b, const size_t l, bool f = false); -}; - -/// This type will map to python's list of (int, float) -struct IntWithFloatArray { - const float* valBuf; - const int* idxBuf; - const size_t length; - bool needFree; - IntWithFloatArray(const float* v, const int* i, size_t l, bool f = false); -}; - -enum SparseValueType { SPARSE_NON_VALUE = 0, SPARSE_VALUE = 1 }; - -enum SparseFormatType { SPARSE_CSR = 0, SPARSE_CSC = 1 }; - -/** - * In Python, -1UL is hard to write. So define a const value used by python - * side. 
- */ -const size_t NO_SPARSE_ID = -1UL; - -struct MatrixPrivate; -class Matrix { - Matrix(); // User Cannot Create Matrix. - DISABLE_COPY(Matrix); - static Matrix* createByPaddleMatrixPtr(void* sharedPtr); - - public: - virtual ~Matrix(); - - /** - * Create A Matrix with height,width, which is filled by zero. - */ - static Matrix* createZero(size_t height, - size_t width, - bool useGpu = isUsingGpu()); - - /** - * Create Sparse Matrix. - * - * After create sparse, sparseCopyFrom can be used to fill matrix. - * - * @param nnz Number of non zero values. - * - * @note the default sparse type is SPARSE_CSR. - */ - static Matrix* createSparse(size_t height, - size_t width, - size_t nnz, - bool isNonVal = true, - bool trans = false, - bool useGpu = isUsingGpu()); - - /** - * Create Dense Matrix. - * - * @param data list of float should be passed in python. - * @note the value will be copy into a new matrix. - */ - static Matrix* createDense(const std::vector& data, - size_t height, - size_t width, - bool useGpu = isUsingGpu()); - - static Matrix* createDenseFromNumpy( - float* data, - int dim1, - int dim2, - bool copy = true, - bool useGpu = isUsingGpu()) throw(UnsupportError); - - /** - * Create Cpu Dense Matrix from numpy matrix, dtype=float32 - * - * @param data a numpy matrix. - * @param dim1 dimension of data. - * @param dim2 dimension of data. - * @param copy true if copy into a new matrix, false will create - * matrix inplace. copy = false should be used with extreme - * care because Matrix will share the memory with the given - * numpy array. If the numpy array object is no longer valid, - * the memory space will not be usable. - */ - static Matrix* createCpuDenseFromNumpy(float* data, - int dim1, - int dim2, - bool copy = true); - - /// Create Gpu Dense Matrix from numpy matrix, dtype=float32 - static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2); - - /** - * Cast to numpy matrix. - * - * @note This method take no parameter in python. - * @note This method in python will return a numpy matrix, not void. - * @note Only CpuDenseMatrix is supported. - * - * Example: - * @code - * import paddle - * m = paddle.Matrix.createZero(10,2) - * numpy_mat = m.toNumpyMat() - * @endcode - */ - void toNumpyMatInplace(float** view_data, - int* dim1, - int* dim2) throw(UnsupportError); - - /// Copy To numpy mat. - void copyToNumpyMat(float** view_m_data, - int* dim1, - int* dim2) throw(UnsupportError); - - /// Copy From Numpy Mat - void copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError, - RangeError); - - /// return true if this matrix is sparse. - bool isSparse() const; - - SparseValueType getSparseValueType() const throw(UnsupportError); - - SparseFormatType getSparseFormat() const throw(UnsupportError); - - IntArray getSparseRowCols(size_t i) const throw(UnsupportError, RangeError); - - IntWithFloatArray getSparseRowColsVal(size_t i) const - throw(UnsupportError, RangeError); - - size_t getHeight() const; - - size_t getWidth() const; - - float get(size_t x, size_t y) const throw(RangeError); - - void set(size_t x, size_t y, float val) throw(RangeError, UnsupportError); - - /// return type is list of float - FloatArray getData() const; - - /** - * Copy from rows, cols, values. 
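 * Illustrative example (not from the original header): the copy semantics of
 * createCpuDenseFromNumpy declared above.
 * @code
 * float buf[6] = {0, 1, 2, 3, 4, 5};
 * // copy=true: the matrix owns its data; buf may be released afterwards.
 * Matrix* owned = Matrix::createCpuDenseFromNumpy(buf, 2, 3, true);
 * // copy=false: the matrix aliases buf; buf must outlive the matrix.
 * Matrix* view = Matrix::createCpuDenseFromNumpy(buf, 2, 3, false);
 * @endcode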
- * - * if sparse_nonvalue, the values should be [] - */ - void sparseCopyFrom(const std::vector& rows, - const std::vector& cols, - const std::vector& values = - std::vector()) throw(UnsupportError); - - bool isGpu() const; - - private: - void* getSharedPtr() const; - - MatrixPrivate* m; - friend class Trainer; - friend class GradientMachine; - friend class Arguments; -}; - -struct VectorPrivate; -class Vector { - DISABLE_COPY(Vector); - Vector(); - static Vector* createByPaddleVectorPtr(void* ptr); - - void* getSharedPtr(); - - public: - ~Vector(); - - /// Create Vector filled with zero. - static Vector* createZero(size_t sz, bool useGpu = isUsingGpu()); - - /** - * Create Vector from list of float. - * - * It will create a new vector, and copy data into it. - */ - static Vector* create(const std::vector& data, - bool useGpu = isUsingGpu()); - - static Vector* createVectorFromNumpy( - float* data, - int dim, - bool copy = true, - bool useGpu = isUsingGpu()) throw(UnsupportError); - /** - * Create Cpu Vector from numpy array, which dtype=float32 - * - * If copy is false, it will create vector inplace. - */ - static Vector* createCpuVectorFromNumpy(float* data, - int dim, - bool copy = true); - - /// Create Gpu Vector from numpy array, which dtype=float32 - static Vector* createGpuVectorFromNumpy(float* data, int dim); - - /** - * copy from another vector - * throw(RangeError) if size of src vector is different from size of this - * vector - */ - void copyFrom(Vector* src) throw(RangeError); - - /// Cast to numpy array inplace. - void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError); - - /// Copy to numpy array. - void copyToNumpyArray(float** view_m_data, int* dim1); - - /// Copy from numpy array. - void copyFromNumpyArray(float* data, int dim); - - /// __getitem__ in python - float get(const size_t idx) const throw(RangeError, UnsupportError); - - /// __setitem__ in python - void set(const size_t idx, float val) throw(RangeError, UnsupportError); - - /// Return is GPU vector or not. - bool isGpu() const; - - /// Return a list of float, the memory is alloced and copied. - FloatArray getData() const; - - /// __len__ in python - size_t getSize() const; - - private: - VectorPrivate* m; - - private: - friend class Parameter; - friend class ParameterOptimizer; - friend struct ParameterTraverseCallbackPrivate; -}; - -struct IVectorPrivate; -class IVector { - IVector(); - DISABLE_COPY(IVector); - static IVector* createByPaddleVectorPtr(void* ptr); - - public: - /// Create IVector filled with zero - static IVector* createZero(size_t sz, bool useGpu = isUsingGpu()); - - /** - * Create IVector from list of int. - * It will create a new vector, and copy data into it. - */ - static IVector* create(const std::vector& data, - bool useGpu = isUsingGpu()); - - static IVector* createVectorFromNumpy( - int* data, - int dim, - bool copy = true, - bool useGpu = isUsingGpu()) throw(UnsupportError); - - /** - * Create Cpu IVector from numpy array, which dtype=int32 - * - * If copy is false, it will create vector inplace - */ - static IVector* createCpuVectorFromNumpy(int* data, - int dim, - bool copy = true); - /** - * Create Gpu IVector from numpy array, which dtype=int32 - */ - static IVector* createGpuVectorFromNumpy(int* data, int dim); - - /// Cast to numpy array inplace. - void toNumpyArrayInplace(int** view_data, int* dim1) throw(UnsupportError); - - /// Copy to numpy array. - void copyToNumpyArray(int** view_m_data, int* dim1); - - /// Copy from numpy array. 
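// Illustrative sketch (not from the original header): creating and reading a
// CPU Vector through the interface declared above.
// @code
// std::vector<float> src = {1.0f, 2.0f, 3.0f};
// Vector* vec = Vector::create(src, /*useGpu=*/false);
// float first = vec->get(0);        // __getitem__ on the Python side
// FloatArray all = vec->getData();  // copied out; all.needFree tells the
//                                   // caller whether to delete[] all.buf
// delete vec;
// @endcode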
- void copyFromNumpyArray(int* data, int dim); - - virtual ~IVector(); - - /// Return a list of int, the memory is alloced and copied. - IntArray getData() const; - - /// This method will map to python [] method. - int& operator[](const size_t idx) throw(RangeError, UnsupportError); - - const int& operator[](const size_t idx) const - throw(RangeError, UnsupportError); - - inline int get(const size_t idx) const throw(RangeError, UnsupportError) { - return (*this)[idx]; - } - - inline void set(const size_t idx, int val) throw(RangeError, UnsupportError) { - (*this)[idx] = val; - } - - /// Return true if it is gpu vector. - bool isGpu() const; - - /// This method will map to python __len__(); - size_t getSize() const; - - private: - void* getSharedPtr() const; - - friend class Arguments; - IVectorPrivate* m; -}; - -struct ArgumentsPrivate; - -/// The Arguments is actual a std::vector in paddle. -class Arguments { - private: - Arguments(); // Internal Create. - DISABLE_COPY(Arguments); - - public: - /** - * Create a arguments with size. - * Note that it can be zero. - */ - static Arguments* createArguments(size_t slotNum); - - void resize(size_t slotNum); - - virtual ~Arguments(); - - /** - * Return the slot number that aguments contains. - * - * It is actually the vector's size - */ - size_t getSlotNum() const; - - /** - * The get functions of Arguments - * - * the param idx is the slot id - */ - Matrix* getSlotValue(size_t idx) const throw(RangeError); - Matrix* getSlotGrad(size_t idx) const throw(RangeError); - IVector* getSlotIds(size_t idx) const throw(RangeError); - Matrix* getSlotIn(size_t idx) const throw(RangeError); - IVector* getSlotSequenceStartPositions(size_t idx) const throw(RangeError); - IVector* getSlotSubSequenceStartPositions(size_t idx) const throw(RangeError); - IVector* getSlotSequenceDim(size_t idx) const throw(RangeError); - // End Of get functions of Arguments - - int64_t getBatchSize(size_t idx = 0) const throw(RangeError); - - /** - * The set functions of Arguments. - * - * The param idx is the slot id. - * The other param is the input Matrix or vector. - */ - void setSlotValue(size_t idx, Matrix* mat) throw(RangeError); - void setSlotGrad(size_t idx, Matrix* mat) throw(RangeError); - void setSlotIn(size_t idx, Matrix* mat) throw(RangeError); - void setSlotIds(size_t idx, IVector* vec) throw(RangeError); - void setSlotSequenceStartPositions(size_t idx, - IVector* vec) throw(RangeError); - void setSlotSubSequenceStartPositions(size_t idx, - IVector* vec) throw(RangeError); - void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError); - - /** - * Set the frame height of the idx-th Argument. - * - * @param ids The index of which Argument. - * @param h The height value. - */ - void setSlotFrameHeight(size_t idx, size_t h) throw(RangeError); - - /** - * Set the frame height of the idx-th Argument. - * - * @param ids The index of which Argument. - * @param h The height value. 
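 * Illustrative example (not from the original header): moving data in and out
 * of a network through the slot getters/setters above; "input" stands in for
 * a Matrix prepared by the caller.
 * @code
 * Arguments* args = Arguments::createArguments(1);  // one slot
 * args->setSlotValue(0, input);           // RangeError on a bad slot id
 * Matrix* value = args->getSlotValue(0);
 * IVector* ids = args->getSlotIds(0);
 * @endcode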
- */ - void setSlotFrameWidth(size_t idx, size_t w) throw(RangeError); - - size_t getSlotFrameHeight(size_t idx = 0) const throw(RangeError); - size_t getSlotFrameWidth(size_t idx = 0) const throw(RangeError); - - float sum() const; - - private: - static Arguments* createByPaddleArgumentVector(void* ptr); - static Arguments* createByPaddleArgument(const void* ptr); - void* getInternalArgumentsPtr() const; - - private: - ArgumentsPrivate* m; - friend class Trainer; - friend class GradientMachine; - friend class SequenceGenerator; -}; - -enum GradientMatchineCreateMode { - CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal, - CREATE_MODE_SGD_SPARSE_CPU_TRAINING = - paddle::GradientMachine::kSgdSparseCpuTraining, - CREATE_MODE_TESTING = paddle::GradientMachine::kTesting -}; - -struct ParameterConfigPrivate; -class ParameterConfig { - DISABLE_COPY(ParameterConfig); - ParameterConfig(); - - /** - * Internal methods - */ - static ParameterConfig* createParameterConfigFromParameterSharedPtr( - void* ptr); - static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr); - void* getRawPtr(); - - public: - ~ParameterConfig(); - - /** - * return proto buf string. - */ - std::string toProtoString() const; - - private: - ParameterConfigPrivate* m; - - private: - friend class Parameter; - friend class ParameterOptimizer; - friend struct ParameterTraverseCallbackPrivate; -}; - -struct OptimizationConfigPrivate; -class OptimizationConfig { - DISABLE_COPY(OptimizationConfig); - OptimizationConfig(); - - public: - static OptimizationConfig* createFromProtoString(const std::string& str); - ~OptimizationConfig(); - - /** - * return protobuf string. - */ - std::string toProtoString(); - - private: - OptimizationConfigPrivate* m; - - friend class TrainerConfig; - friend class ParameterOptimizer; - friend class ParameterUpdater; - friend class Trainer; -}; - -struct ParameterPrivate; -class Parameter { - private: - Parameter(); - DISABLE_COPY(Parameter); - - public: - virtual ~Parameter(); - - /** - * get parameter name - */ - std::string getName() const; - - /** - * get buf in Parameter - */ - Vector* getBuf(ParameterType type); - - /** - * get id - */ - size_t getID() const; - - ParameterConfig* getConfig(); - void setValueUpdated(); - - bool save(const std::string& filename) const; - - bool load(const std::string& filename) const; - - size_t getSize() const; - - private: - static Parameter* createFromRawPtr(void* ptr); - static Parameter* createFromSharedPtr(void* ptr); - - private: - ParameterPrivate* m; - friend class UpdateCallbackWrapper; - friend class GradientMachine; - friend class ParameterUpdater; -}; - -struct ModelConfigPrivate; -/** - * You can only get model config from TrainerConfig. - * - * It is used by GradientMachine. - */ -class ModelConfig { - private: - ModelConfig(); - DISABLE_COPY(ModelConfig); - - public: - virtual ~ModelConfig(); - - private: - ModelConfigPrivate* m; - friend class TrainerConfig; - friend struct TrainerConfigPrivate; - friend class GradientMachine; -}; - -struct TrainerConfigPrivate; -/** - * To get TrainerConfig from file. - * - * It is used by GradientMachine. 
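 * Illustrative example (not from the original header): the inspection and
 * persistence surface of Parameter declared above; "p" stands in for a
 * Parameter fetched from a GradientMachine.
 * @code
 * std::string name = p->getName();
 * Vector* value = p->getBuf(PARAMETER_VALUE);  // enum imported from
 *                                              // GlobalConstants.h
 * bool ok = p->save(name + ".bin");            // false on failure
 * @endcode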
- */ -class TrainerConfig { - private: - TrainerConfig(); - DISABLE_COPY(TrainerConfig); - - public: - virtual ~TrainerConfig(); - - static TrainerConfig* createFromTrainerConfigFile( - const std::string& configPath); - static TrainerConfig* createFromProtoString(const std::string& str); - - ModelConfig* getModelConfig() const; - - OptimizationConfig* getOptimizationConfig() const; - - private: - TrainerConfigPrivate* m; - friend class Trainer; -}; - -/** - * The callback in backword. - * - * You can inherit this class in python. - * - * @code - * class UpdateCallbackInPython(paddle.UpdateCallback): - * def __init__(self): - * paddle.UpdateCallback.__init__(self) - * - * def apply(self, param): - * assert isinstance(param, paddle.Parameter) - * @endcode - */ -class UpdateCallback { - public: - virtual ~UpdateCallback(); - virtual void apply(Parameter* p); -}; - -struct ParameterTraverseCallbackPrivate; -class ParameterTraverseCallback { - DISABLE_COPY(ParameterTraverseCallback); - ParameterTraverseCallback(); - - public: - ~ParameterTraverseCallback(); - - void apply(const std::vector& vecs, - const ParameterConfig& config, - size_t sparseId); - - private: - ParameterTraverseCallbackPrivate* m; - friend class ParameterOptimizer; -}; - -/** - * The ParameterOptimizer Wrapper Class. - * - * Basically same as common/ParameterOptimizer.h - */ -struct ParameterOptimizerPrivate; -class ParameterOptimizer { - DISABLE_COPY(ParameterOptimizer); - ParameterOptimizer(); - - public: - static ParameterOptimizer* create(OptimizationConfig* config); - - ~ParameterOptimizer(); - - void init(size_t numRows, const ParameterConfig* config); - - void startPass(); - - void finishPass(); - - void startBatch(size_t numSamplesProcessed); - - void finishBatch(); - - void update(const std::vector& vecs, - const ParameterConfig& conf, - size_t sparseId = NO_SPARSE_ID); - - std::vector getParameterTypes() const; - - ParameterTraverseCallback* needSpecialTraversal( - const ParameterConfig& config) const; - - private: - ParameterOptimizerPrivate* m; -}; - -class SequenceGenerator; -class Evaluator; -struct GradientMachinePrivate; -class GradientMachine { - private: - GradientMachine(); - DISABLE_COPY(GradientMachine); - - public: - virtual ~GradientMachine(); - - /** - * Create By ProtoStr. - * - * The ProtoStr can be generate by python's protobuf code. - */ - static GradientMachine* createByConfigProtoStr( - const std::string& protoStr, - GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, - const std::vector& parameterTypes = defaultParamTypes); - - /** - * Create by ModelConfig object. - * - * To get ModelConfig, you can get TrainerConfig from config file, then get - * model config by TrainerConfig - */ - static GradientMachine* createByModelConfig( - ModelConfig* conf, - GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, - const std::vector& parameterTypes = defaultParamTypes); - - /** - * @brief finish - */ - void finish(); - - void start(); - - /** - * Prefetch row ids of sparse parameter. - */ - void prefetch(const Arguments& inArgs); - - /** - * Do some thing when train pass ended. - */ - void onPassEnd(); - - /** - * The forward stage of GradientMachine. - * - * @note the outArgs could be zero length arguemnts. - * @note THIS METHOD IS VERY USEFULL FOR PREDICT FROM TRAINED MODEL. - */ - void forward(const Arguments& inArgs, Arguments* outArgs, PassType passType); - - /** - * The backward stage of GradientMachine. 
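 * Illustrative example (not from the original header; the paths are
 * placeholders): the config-to-machine path described above, ending in a
 * forward pass for prediction.
 * @code
 * TrainerConfig* tc = TrainerConfig::createFromTrainerConfigFile("conf");
 * GradientMachine* gm = GradientMachine::createByModelConfig(
 *     tc->getModelConfig(), CREATE_MODE_TESTING);
 * gm->loadParameters("./model");
 * gm->forward(inArgs, &outArgs, PASS_TEST);
 * @endcode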
- * - * @note Currently the ParameterUpdater is not wrapped in SWIG, so backward - * cannot actually train a network. But you can write a update callback to - * change the parameter or implement a ParameterUpdater in python side. - */ - void backward(const UpdateCallback& callback = UpdateCallback()); - - /** - * Combine forward/backward - */ - void forwardBackward(const Arguments& inArgs, - Arguments* outArgs, - PassType passType, - const UpdateCallback& callback = UpdateCallback()); - - void loadParameters(const std::string& path); - - size_t getParameterSize() const; - Parameter* getParameter(size_t i) throw(RangeError); - - size_t getNonStaticParameterSize() const; - Parameter* getNonStaticParameter(size_t i) throw(RangeError); - - void randParameters(); - - Arguments* getLayerOutput(const std::string& layerName) const - throw(UnsupportError); - - /** - * Create a sequence generator. - * - * @note It just like a paddle_gen_sequence. - */ - SequenceGenerator* asSequenceGenerator( - const std::vector& dict = std::vector(), - size_t begin_id = 0UL, - size_t end_id = 0UL, - size_t max_length = 100UL, - size_t beam_size = -1UL); - - Evaluator* makeEvaluator(); - - void eval(Evaluator* evaluator); - - private: - GradientMachinePrivate* m; - - static GradientMachine* createFromPaddleModelPtr( - const void* confPtr, - GradientMatchineCreateMode mode, - const std::vector& types); - - // Not to use c++ 11 init-list, so we use static var as function default arg. - static std::vector defaultParamTypes; - friend class Trainer; - friend class ParameterUpdater; -}; - -struct ParameterUpdaterPrivate; -class ParameterUpdater { - private: - ParameterUpdater(); - - public: - static ParameterUpdater* createLocalUpdater(OptimizationConfig* config); - static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config, - int passCount, - bool useSparseUpdater); - static ParameterUpdater* createNewRemoteUpdater( - OptimizationConfig* config, - const std::string pserverSpec, - const bool useEtcd) throw(UnsupportError); - ~ParameterUpdater(); - - /** - * @brief initialize Parameter Updater by GradientMachine. - * @param gm - */ - void init(const GradientMachine& gm); - - /** - * @brief begin of a training/testing of one pass. - */ - void startPass(); - - /** - * @brief end of a traning/testing of one pass. - */ - void finishPass(); - - /** - * @brief begin of a training/testing of one batch. - * @param data batch's size - * @return PassType, mostly will be training. - */ - PassType startBatch(size_t batchSize); - - /** - * @brief end of a traning/testing of one batch - * @param cost current batch cost. - */ - void finishBatch(float cost); - - /** - * @brief update a parameter (by local optimizer or by cluster pserver) - * @param param - */ - void update(Parameter* param); - - /** - * @breif only get required sparse rows by default. - * @param fullSize: get full matrix parameter if *fullSize* set - * @param apply: get PARAMETER_APPLY on pserver if *apply* set - */ - void getParametersRemote(bool fullSize = false, bool apply = false); - - /** - * @brief restore the average parameter. - * @note It is only used in AverageOptimizer. Restore will get the current - * PARAMETER_VALUE back. - */ - void restore(); - - /** - * @brief apply. Store the average parameter. - * @note It is only used in AverageOptimizer. Apply will store the current - * PARAMETER_VALUE to buffer, calcaualte current Average Parameter, and save - * it to PARAMETER_VALUE. 
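 * Illustrative example (not from the original header): the per-pass and
 * per-batch call order of the lifecycle methods above; gm and cost are
 * placeholders.
 * @code
 * ParameterUpdater* upd = ParameterUpdater::createLocalUpdater(optConfig);
 * upd->init(*gm);
 * upd->startPass();
 * PassType pt = upd->startBatch(128);   // mostly training
 * for (size_t i = 0; i < gm->getParameterSize(); ++i) {
 *   upd->update(gm->getParameter(i));   // caller owns the returned wrapper
 * }
 * upd->finishBatch(cost);
 * upd->finishPass();
 * @endcode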
- */ - void apply(); - - /** - * @brief catchUpWith The Regularization will be delayed in many situations( - * pserver, local sparse). Catch Up means catch the regularization up, apply - * regularization to all params. - */ - void catchUpWith(); - - private: - ParameterUpdaterPrivate* m; -}; - -struct EvaluatorPrivate; -class Evaluator { - private: - Evaluator(); - DISABLE_COPY(Evaluator); - - public: - ~Evaluator(); - - /** - * @brief begin an evaluate stage. - */ - void start(); - - /** - * @brief end an evaluate stage. - */ - void finish(); - - /** - * @brief toString will get a evaluate result. - * - * __repr__ method in python - */ - std::string toString(); - - std::vector getNames() const; - - double getValue(const std::string name) const; - - private: - EvaluatorPrivate* m; - - friend class GradientMachine; -}; - -struct TrainerPrivate; -class Trainer { - private: - TrainerPrivate* m; - Trainer(); - Trainer(TrainerConfig* optConfig, GradientMachine* gm); - DISABLE_COPY(Trainer); - - public: - virtual ~Trainer(); - - /// Create A Trainer By TrainerConfig. using paddle command line. - static Trainer* createByCommandLine() throw(IOError); - - static Trainer* create(TrainerConfig* optConfig, - GradientMachine* gm) throw(IOError); - - /// Start training - void startTrain(); - - /// Finish training - void finishTrain(); - - /// Start a pass. - void startTrainPass(); - - /// Finish a pass - void finishTrainPass(); - - /** - * Train one batch, - * - * @return true if all batch finished. - */ - bool trainOneBatch(size_t batchSize); - - void trainOneDataBatch(size_t batchSize, const Arguments& args); - - void startTestPeriod(); - void testOneDataBatch(size_t batchSize, const Arguments& args); - void finishTestPeriod(); - - void forwardOneBatch(size_t batchSize); - - Arguments* getForwardOutput(); - - Arguments* getLayerOutput(const std::string& layerName) const; -}; - -/// the N-Best results generated from one input sequence. -class ISequenceResults { - public: - virtual ~ISequenceResults(); - - /// Number of result. - virtual size_t getSize() const = 0; - - /** - * Get sentence from dictionary. - * - * @param id the index of result. - * @param split if true, the return sentence will be splited with ' ' by - * each word. Default is false. - */ - virtual std::string getSentence(size_t id, bool split = false) const - throw(RangeError) = 0; - virtual std::vector getSequence(size_t id) const throw(RangeError) = 0; - virtual float getScore(size_t id) const throw(RangeError) = 0; -}; - -struct SequenceGeneratorPrivate; -class SequenceGenerator { - DISABLE_COPY(SequenceGenerator); - SequenceGenerator(); - - public: - virtual ~SequenceGenerator(); - - /** - * Generate Sequence by input. - * - * @note The inArgs is just one sequence of data. - * @note The return will get a N-best generate result by inArgs. - * Sort by score. 
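 * Illustrative example (not from the original header): consuming the N-best
 * results produced by the method documented here.
 * @code
 * ISequenceResults* res = gen->generateSequence(inArgs);
 * for (size_t i = 0; i < res->getSize(); ++i) {
 *   std::string sent = res->getSentence(i, true);  // space-joined words
 *   float score = res->getScore(i);                // log probability
 * }
 * @endcode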
- */ - ISequenceResults* generateSequence(const Arguments& inArgs) const; - - void setDict(const std::vector& dict); - void setBos(size_t bos); - void setEos(size_t eos); - void setMaxLength(size_t maxlength); - void setBeamSize(size_t beamSize); - - private: - static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr); - friend class GradientMachine; - - private: - SequenceGeneratorPrivate* m; -}; diff --git a/paddle/api/PaddleAPIPrivate.h b/paddle/api/PaddleAPIPrivate.h deleted file mode 100644 index e141fcd761d7db2d3836a6343700ac4a7ca80c16..0000000000000000000000000000000000000000 --- a/paddle/api/PaddleAPIPrivate.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once -#include -#include "PaddleAPI.h" -#include "paddle/gserver/evaluators/Evaluator.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" -#include "paddle/parameter/ParameterUpdaterBase.h" -#include "paddle/trainer/TrainerConfigHelper.h" - -struct GradientMachinePrivate { - std::shared_ptr machine; - - template - inline T& cast(void* ptr) { - return *(T*)(ptr); - } -}; - -struct OptimizationConfigPrivate { - std::shared_ptr trainer_config; - paddle::OptimizationConfig config; - - const paddle::OptimizationConfig& getConfig() { - if (trainer_config != nullptr) { - return trainer_config->getOptConfig(); - } else { - return config; - } - } -}; - -struct TrainerConfigPrivate { - std::shared_ptr conf; - TrainerConfigPrivate() {} -}; - -struct ModelConfigPrivate { - std::shared_ptr conf; -}; - -struct ArgumentsPrivate { - std::vector outputs; - - inline paddle::Argument& getArg(size_t idx) throw(RangeError) { - if (idx < outputs.size()) { - return outputs[idx]; - } else { - RangeError e; - throw e; - } - } - - template - std::shared_ptr& cast(void* rawPtr) const { - return *(std::shared_ptr*)(rawPtr); - } -}; - -struct ParameterUpdaterPrivate { - std::unique_ptr updater; -}; - -struct ParameterPrivate { - std::shared_ptr sharedPtr; - paddle::Parameter* rawPtr; // rawPtr only used in ParameterUpdater, - // in other situation sharedPtr should - // contains value. - - ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {} - - paddle::Parameter* getPtr() { - if (sharedPtr) { - return sharedPtr.get(); - } else { - return rawPtr; - } - } -}; - -struct EvaluatorPrivate { - paddle::Evaluator* rawPtr; - - EvaluatorPrivate() : rawPtr(nullptr) {} - ~EvaluatorPrivate() { delete rawPtr; } -}; diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp deleted file mode 100644 index 589d22e74e742de2595a9efd17412ddc55159230..0000000000000000000000000000000000000000 --- a/paddle/api/Parameter.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/parameter/Parameter.h" -#include "PaddleAPI.h" -#include "PaddleAPIPrivate.h" - -Parameter::Parameter() : m(new ParameterPrivate()) {} - -Parameter::~Parameter() { delete m; } - -Parameter* Parameter::createFromRawPtr(void* ptr) { - auto p = new Parameter(); - p->m->rawPtr = *static_cast(ptr); - return p; -} - -Parameter* Parameter::createFromSharedPtr(void* ptr) { - auto& p = *(paddle::ParameterPtr*)(ptr); - if (p == nullptr) { - return nullptr; - } else { - auto retParam = new Parameter(); - retParam->m->sharedPtr = p; - return retParam; - } -} - -std::string Parameter::getName() const { return m->getPtr()->getName(); } - -Vector* Parameter::getBuf(ParameterType type) { - auto buf = m->getPtr()->getBuf(type); - return Vector::createByPaddleVectorPtr(&buf); -} - -ParameterConfig* Parameter::getConfig() { - if (m->sharedPtr) { - return ParameterConfig::createParameterConfigFromParameterSharedPtr( - &m->sharedPtr); - } else { - return ParameterConfig::createParameterConfigFromParameterPtr(m->rawPtr); - } -} - -size_t Parameter::getID() const { return m->getPtr()->getID(); } - -void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); } - -bool Parameter::save(const std::string& filename) const { - return m->getPtr()->save(filename); -} - -bool Parameter::load(const std::string& filename) const { - return m->getPtr()->load(filename); -} - -size_t Parameter::getSize() const { return m->getPtr()->getSize(); } diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp deleted file mode 100644 index d4620be3e6f26cdd4caffffac712e4ef936b222a..0000000000000000000000000000000000000000 --- a/paddle/api/ParameterOptimizer.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "paddle/parameter/ParameterOptimizer.h"
-#include <algorithm>
-#include "Internal.h"
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-struct ParameterOptimizerPrivate {
-  std::unique_ptr<paddle::ParameterOptimizer> optimizer;
-};
-
-struct ParameterTraverseCallbackPrivate {
-  paddle::ParameterOptimizer::TraverseCallback callback;
-
-  ParameterTraverseCallbackPrivate() {}
-
-  ParameterTraverseCallbackPrivate(
-      const paddle::ParameterOptimizer::TraverseCallback& callback)
-      : callback(callback) {}
-
-  void apply(const std::vector<Vector*>& vecs,
-             const ParameterConfig& conf,
-             size_t sparseId) {
-    std::vector<paddle::VectorPtr> real_vecs;
-    real_vecs.resize(vecs.size());
-    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
-      if (v) {
-        return *(paddle::VectorPtr*)(v->getSharedPtr());
-      } else {
-        return paddle::VectorPtr();
-      }
-    });
-
-    paddle::ParameterConfig& real_conf =
-        *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
-                                        .getRawPtr());
-    callback(real_vecs.data(), real_conf, sparseId);
-  }
-};
-
-ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}
-
-ParameterOptimizer::~ParameterOptimizer() { delete m; }
-
-ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
-  CHECK(config != nullptr);
-  auto retOptimizer = new ParameterOptimizer();
-  retOptimizer->m->optimizer.reset(
-      paddle::ParameterOptimizer::create(config->m->getConfig(), false));
-  return retOptimizer;
-}
-
-void ParameterOptimizer::init(size_t numRows, const ParameterConfig* config) {
-  auto& conf = *(paddle::ParameterConfig*)(const_cast<ParameterConfig*>(config)
-                                               ->getRawPtr());
-  m->optimizer->init(numRows, &conf);
-}
-
-void ParameterOptimizer::startPass() { m->optimizer->startPass(); }
-
-void ParameterOptimizer::finishPass() { m->optimizer->finishPass(); }
-
-void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
-  constexpr size_t high_1 = 1UL << (sizeof(size_t) * 8 - 1);
-  CHECK_EQ(numSamplesProcessed & high_1, 0UL);  // Safely cast.
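// Aside, illustrative only (not part of the original file): the CHECK_EQ
// above rejects any size_t whose top bit is set before the (int64_t)
// narrowing on the next line, where such a value would become negative.
// The same guard in standalone form:
//
//   inline int64_t toInt64Checked(size_t n) {
//     constexpr size_t highBit = 1UL << (sizeof(size_t) * 8 - 1);
//     CHECK_EQ(n & highBit, 0UL);  // value fits in int64_t
//     return static_cast<int64_t>(n);
//   }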
- m->optimizer->startBatch((int64_t)numSamplesProcessed); -} - -void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); } - -void ParameterOptimizer::update(const std::vector& vecs, - const ParameterConfig& conf, - size_t sparseId) { - ParameterTraverseCallbackPrivate invoker( - [&](const paddle::VectorPtr _vecs[], - const paddle::ParameterConfig& config, - size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); }); - invoker.apply(vecs, conf, sparseId); -} - -std::vector ParameterOptimizer::getParameterTypes() const { - std::vector returnValue; - staticCastVector(&returnValue, m->optimizer->getParameterTypes()); - return returnValue; -} - -ParameterTraverseCallback::ParameterTraverseCallback() - : m(new ParameterTraverseCallbackPrivate()) {} - -ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; } - -void ParameterTraverseCallback::apply(const std::vector& vecs, - const ParameterConfig& conf, - size_t sparseId) { - m->apply(vecs, conf, sparseId); -} - -ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal( - const ParameterConfig& config) const { - auto& param_config = - *(paddle::ParameterConfig*)const_cast(config) - .getRawPtr(); - auto callback = m->optimizer->needSpecialTraversal(param_config); - if (callback) { - auto retCallback = new ParameterTraverseCallback(); - retCallback->m->callback = callback; - return retCallback; - } else { - return nullptr; - } -} diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp deleted file mode 100644 index 1446c3084238859a759669f3a32c7efde67dcc2b..0000000000000000000000000000000000000000 --- a/paddle/api/SequenceGenerator.cpp +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include "PaddleAPI.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" -#include "paddle/parameter/Argument.h" -#include "paddle/utils/Flags.h" - -// used to represent partial sequence -struct Path { - std::vector ids; - float logProb; - paddle::MachineState machineState; - - Path() { logProb = 0; } - - Path(std::vector& ids, float logProb, paddle::MachineState& machineState) - : ids(ids), logProb(logProb), machineState(machineState) {} - - bool operator<(const Path& other) const { return (logProb > other.logProb); } -}; - -// Return top k (k == beam_size) optimal paths using beam search. The last -// element of inArgs is the Argument of feedback. 
gradMachine has MaxIdLayer -// as output and outArgs thus stores top k labels and their probabilities per -// position -static void findNBest(paddle::GradientMachine* gradMachine, - std::vector& inArgs, - std::vector& finalPaths, - size_t bos_id, - size_t eos_id, - size_t max_length) { - std::vector paths; - Path emptyPath; - paths.push_back(emptyPath); - finalPaths.clear(); - gradMachine->resetState(); - paddle::Argument feedback = inArgs.back(); - feedback.ids->setElement(0, (int)(bos_id)); - float minFinalPathLogProb = 0; - size_t beam = 0; - int id; - std::vector outArgs; - while (true) { // iterate over each generated word - std::vector newPaths; - paddle::MachineState machineState; - for (size_t j = 0; j < paths.size(); j++) { - Path& path = paths[j]; - if (path.machineState.size() > 0) { - gradMachine->setState(path.machineState); - feedback.ids->setElement(0, path.ids.back()); - } - gradMachine->forward(inArgs, &outArgs, paddle::PASS_TEST); - gradMachine->getState(machineState); - beam = outArgs[0].ids->getSize(); - for (size_t k = 0; k < beam; k++) { - id = outArgs[0].ids->getElement(k); - float prob = outArgs[0].in->getElement(0, k); - std::vector nids(path.ids); - nids.push_back(id); - float newLogProb = path.logProb + log(prob); - Path newPath(nids, newLogProb, machineState); - if (id == (int)eos_id || nids.size() >= max_length) { - finalPaths.push_back(newPath); - if (minFinalPathLogProb > newPath.logProb) { - minFinalPathLogProb = newPath.logProb; - } - } else { - newPaths.push_back(newPath); - } - } - } - - if (newPaths.size() == 0) { - break; - } - std::nth_element(newPaths.begin(), - newPaths.begin() + std::min(beam, newPaths.size()), - newPaths.end()); - if (newPaths.size() > beam) { - newPaths.resize(beam); - } - // pathA < pathB means pathA.logProb > pathB.logProb - float maxPathLogProb = - std::min_element(newPaths.begin(), newPaths.end())->logProb; - if (finalPaths.size() >= beam && minFinalPathLogProb >= maxPathLogProb) { - break; - } - paths = newPaths; - } // end while - - std::partial_sort(finalPaths.begin(), - finalPaths.begin() + std::min(beam, finalPaths.size()), - finalPaths.end()); - if (finalPaths.size() > beam) { - finalPaths.resize(beam); - } -} - -struct SequenceGeneratorPrivate { - std::shared_ptr machine; - std::shared_ptr> dict; - size_t beginPos; - size_t endPos; - size_t maxLength; - - paddle::Argument feedback; - - template - inline T& cast(void* ptr) { - return *(T*)(ptr); - } - - inline void findNBest(std::vector& inArgs, - std::vector& path) { - ::findNBest(machine.get(), inArgs, path, beginPos, endPos, maxLength); - } - - SequenceGeneratorPrivate() - : dict(std::make_shared>()), - beginPos(0UL), - endPos(0UL), - maxLength(0UL), - feedback(__create_feedback__()) {} - - private: - static paddle::Argument __create_feedback__() { - paddle::Argument feedback; - feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu); - - feedback.sequenceStartPositions = - paddle::ICpuGpuVector::create(/* size= */ 2, /* useGpu= */ false); - feedback.sequenceStartPositions->getMutableData(false)[0] = 0; - feedback.sequenceStartPositions->getMutableData(false)[1] = 1; - return feedback; - } -}; - -SequenceGenerator::SequenceGenerator() : m(new SequenceGeneratorPrivate()) {} - -SequenceGenerator::~SequenceGenerator() { delete m; } - -class PathSequenceResults : public ISequenceResults { - // ISequenceResults interface - public: - PathSequenceResults(const std::shared_ptr>& path, - const std::shared_ptr>& dict) - : path_(path), dict_(dict) {} - - size_t 
getSize() const { return path_->size(); } - std::string getSentence(size_t id, bool split) const throw(RangeError) { - if (id < getSize()) { - Path& p = (*path_)[id]; - std::ostringstream sout; - std::transform(p.ids.begin(), - p.ids.end(), - std::ostream_iterator(sout, split ? " " : ""), - [&](int id) { return (*dict_)[id]; }); - return sout.str(); - } else { - RangeError e; - throw e; - } - } - std::vector getSequence(size_t id) const throw(RangeError) { - if (id < getSize()) { - Path& p = (*path_)[id]; - return p.ids; - } else { - RangeError e; - throw e; - } - } - float getScore(size_t id) const throw(RangeError) { - if (id < getSize()) { - Path& p = (*path_)[id]; - return p.logProb; - } else { - RangeError e; - throw e; - } - } - - private: - std::shared_ptr> path_; - std::shared_ptr> dict_; -}; - -ISequenceResults* SequenceGenerator::generateSequence( - const Arguments& inArgs) const { - auto& in_args = - m->cast>(inArgs.getInternalArgumentsPtr()); - for (auto& arg : in_args) { - arg.sequenceStartPositions = m->feedback.sequenceStartPositions; - } - in_args.push_back(m->feedback); - auto path = std::make_shared>(); - m->findNBest(in_args, *path); - return new PathSequenceResults(path, m->dict); -} - -SequenceGenerator* SequenceGenerator::createByGradientMachineSharedPtr( - void* ptr) { - SequenceGenerator* r = new SequenceGenerator(); - r->m->machine = r->m->cast>(ptr); - return r; -} - -void SequenceGenerator::setDict(const std::vector& dict) { - *m->dict = dict; -} - -void SequenceGenerator::setBos(size_t bos) { m->beginPos = bos; } - -void SequenceGenerator::setEos(size_t eos) { m->endPos = eos; } - -void SequenceGenerator::setMaxLength(size_t maxLength) { - m->maxLength = maxLength; -} - -void SequenceGenerator::setBeamSize(size_t beamSize) { - if (beamSize != -1UL) { - FLAGS_beam_size = beamSize; - } -} - -ISequenceResults::~ISequenceResults() {} diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp deleted file mode 100644 index 795460b65051b4ec0d9772d2503f123c4a6ea3d0..0000000000000000000000000000000000000000 --- a/paddle/api/Trainer.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "PaddleAPI.h" -#include "PaddleAPIPrivate.h" - -#include -#include -#include - -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/trainer/ParamUtil.h" -#include "paddle/trainer/Trainer.h" -#include "paddle/trainer/TrainerInternal.h" -#include "paddle/utils/Flags.h" - -using paddle::real; - -DECLARE_string(config); -DECLARE_string(init_model_path); -DECLARE_int32(start_pass); - -struct TrainerPrivate : public paddle::Trainer { - bool _trainOneBatch(size_t batchSize); - bool forwardOneBatch(size_t batchSize); - void forwardOneDataBatch(const std::vector& inArgs); - void setBatchSize(size_t batchSize); - std::vector& getForwardOutput(); - - void startTestPeriod(); - void finishTestPeriod(); - void testOneDataBatch(const paddle::DataBatch& dataBatch); - TrainerPrivate() : paddle::Trainer() {} -}; - -Trainer::Trainer() : m(new TrainerPrivate()) { - auto conf = paddle::TrainerConfigHelper::createFromFlags(); - if (conf != nullptr) { - m->init(conf); - } -} - -Trainer::~Trainer() { delete m; } - -Trainer* Trainer::createByCommandLine() throw(IOError) { - auto retv = new Trainer(); - if (retv->m->getConfig().IsInitialized()) { - return retv; - } else { - throw IOError(); - } -} - -Trainer::Trainer(TrainerConfig* config, GradientMachine* gm) - : m(new TrainerPrivate()) { - m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr); -} - -Trainer* Trainer::create(TrainerConfig* config, - GradientMachine* gm) throw(IOError) { - auto retv = new Trainer(config, gm); - if (retv->m->getConfig().IsInitialized()) { - return retv; - } else { - retv->m->getConfig().CheckInitialized(); - throw IOError(); - } -} - -void Trainer::startTrain() { m->startTrain(); } - -void Trainer::finishTrain() { m->finishTrain(); } - -void Trainer::startTrainPass() { m->startTrainPass(); } - -void Trainer::finishTrainPass() { m->finishTrainPass(); } - -void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) { - paddle::DataBatch dataBatch; - dataBatch.getStreams() = inArgs.m->outputs; - dataBatch.setSize(batchSize); - m->trainOneDataBatch(dataBatch); -} - -bool Trainer::trainOneBatch(size_t batchSize) { - return m->_trainOneBatch(batchSize); -} - -bool TrainerPrivate::_trainOneBatch(size_t batchSize) { - paddle::DataBatch dataBatch; - CHECK(dataProvider_) << "data_provider is not specified"; - int num = dataProvider_->getNextBatch(batchSize, &dataBatch); - if (num == 0) { - return false; - } - trainOneDataBatch(dataBatch); - return false; -} - -void TrainerPrivate::startTestPeriod() { - if (!tester_) { - createTester(); - } - tester_->startTestPeriod(); -} - -void Trainer::startTestPeriod() { m->startTestPeriod(); } - -void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) { - tester_->testOneDataBatch(dataBatch, &forwardOutput_); -} - -void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) { - paddle::DataBatch dataBatch; - dataBatch.getStreams() = args.m->outputs; - dataBatch.setSize(batchSize); - m->testOneDataBatch(dataBatch); -} - -void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); } -void Trainer::finishTestPeriod() { m->finishTestPeriod(); } - -Arguments* Trainer::getLayerOutput(const std::string& layerName) const { - auto nn = this->m->getGradientMachine(); - CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork"; - auto arg = nn->getLayerOutput(layerName); - return Arguments::createByPaddleArgument(&arg); -} - -void Trainer::forwardOneBatch(size_t batchSize) { - 
m->forwardOneBatch(batchSize); -} - -bool TrainerPrivate::forwardOneBatch(size_t batchSize) { - CHECK(dataProvider_) << "data_provider is not specified"; - paddle::DataBatch dataBatch; - int num = dataProvider_->getNextBatch(batchSize, &dataBatch); - if (num == 0) { - return false; - } - - forwardOneDataBatch(dataBatch.getStreams()); - return true; -} - -void TrainerPrivate::forwardOneDataBatch( - const std::vector& inArgs) { - std::vector& outArgs = forwardOutput_; - - if (config_->getOptConfig().use_sparse_remote_updater()) { - trainerInternal_.getGradientMachine()->prefetch(inArgs); - trainerInternal_.getParameterUpdater()->getParametersRemote(); - } - trainerInternal_.getGradientMachine()->forward( - inArgs, &outArgs, paddle::PASS_TEST); -} - -Arguments* Trainer::getForwardOutput() { - return Arguments::createByPaddleArgumentVector(&m->getForwardOutput()); -} - -std::vector& TrainerPrivate::getForwardOutput() { - return forwardOutput_; -} diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp deleted file mode 100644 index 618e87e96459674302d8b468c3ac410e8d3af6a8..0000000000000000000000000000000000000000 --- a/paddle/api/Util.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "PaddleAPI.h" - -#include "paddle/parameter/Parameter.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/PythonUtil.h" -#include "paddle/utils/Util.h" - -#include -#include -#include - -void initPaddle(int argc, char** argv) { - paddle::initMain(argc, argv); - paddle::initPython(argc, argv); - feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); -} - -FloatArray::FloatArray(const float* b, const size_t l) - : buf(b), length(l), needFree(false) {} - -IntArray::IntArray(const int* b, const size_t l, bool f) - : buf(b), length(l), needFree(f) {} - -IntWithFloatArray::IntWithFloatArray(const float* v, - const int* i, - size_t l, - bool f) - : valBuf(v), idxBuf(i), length(l), needFree(f) {} - -bool isUsingGpu() { return FLAGS_use_gpu; } - -void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; } - -bool isGpuVersion() { -#ifndef PADDLE_WITH_CUDA - return false; -#else - return true; -#endif -} - -int getTrainerCount() { return FLAGS_trainer_count; } - -static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES, - "The Parameter Type should be same in core/api and core/common"); diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp deleted file mode 100644 index e2a7b974ca78ae3e6e0e66c206a40c8811126b53..0000000000000000000000000000000000000000 --- a/paddle/api/Vector.cpp +++ /dev/null @@ -1,304 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "PaddleAPI.h" - -#include "paddle/math/Vector.h" - -#include - -struct IVectorPrivate { - paddle::IVectorPtr vec; -}; - -IVector::IVector() : m(new IVectorPrivate()) {} - -IVector* IVector::createZero(size_t sz, bool useGpu) { - auto v = new IVector(); - v->m->vec = paddle::IVector::create(sz, useGpu); - v->m->vec->zeroMem(); - return v; -} - -IVector* IVector::create(const std::vector& data, bool useGpu) { - auto v = new IVector(); - v->m->vec = paddle::IVector::create(data.size(), useGpu); - v->m->vec->copyFrom(data.data(), data.size()); - return v; -} - -IVector* IVector::createVectorFromNumpy(int* data, - int dim, - bool copy, - bool useGpu) throw(UnsupportError) { - if (useGpu) { - /// if use gpu only copy=true is supported - if (!copy) { - throw UnsupportError("Gpu mode only supports copy=True"); - } - return IVector::createGpuVectorFromNumpy(data, dim); - } else { - return IVector::createCpuVectorFromNumpy(data, dim, copy); - } -} - -IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) { - auto v = new IVector(); - if (copy) { - v->m->vec = paddle::IVector::create(dim, false); - v->m->vec->copyFrom(data, dim); - } else { - v->m->vec = paddle::IVector::create(data, dim, false); - } - return v; -} - -IVector* IVector::createGpuVectorFromNumpy(int* data, int dim) { - auto v = new IVector(); - v->m->vec = paddle::IVector::create(dim, true); - v->m->vec->copyFrom(data, dim); - return v; -} - -bool IVector::isGpu() const { - return dynamic_cast(m->vec.get()) != nullptr; -} - -IntArray IVector::getData() const { - if (this->isGpu()) { - int* src = m->vec->getData(); - size_t len = m->vec->getSize(); - int* dest = new int[len]; - hl_memcpy_device2host(dest, src, len * sizeof(int)); - return IntArray(dest, len, true); - } else { - return IntArray(m->vec->getData(), m->vec->getSize()); - } -} - -int& IVector::operator[](const size_t idx) throw(RangeError, UnsupportError) { - if (this->isGpu()) { - UnsupportError e; - throw e; - } else { - if (idx >= m->vec->getSize()) { - RangeError e; - throw e; - } - } - return m->vec->getData()[idx]; -} - -const int& IVector::operator[](const size_t idx) const - throw(RangeError, UnsupportError) { - return (*const_cast(this))[idx]; -} - -IVector* IVector::createByPaddleVectorPtr(void* ptr) { - auto* p = (paddle::IVectorPtr*)ptr; - if ((*p) != nullptr) { - IVector* vec = new IVector(); - vec->m->vec = *p; - return vec; - } else { - return nullptr; - } -} - -IVector::~IVector() { delete m; } - -void* IVector::getSharedPtr() const { return &m->vec; } - -size_t IVector::getSize() const { return m->vec->getSize(); } - -void IVector::toNumpyArrayInplace(int** data, int* dim1) throw(UnsupportError) { - auto v = std::dynamic_pointer_cast(m->vec); - if (v) { - *data = v->getData(); - *dim1 = v->getSize(); - } else { - throw UnsupportError(); - } -} - -void IVector::copyToNumpyArray(int** view_m_data, int* dim1) { - *dim1 = m->vec->getSize(); - *view_m_data = new int[*dim1]; - if (auto cpuVec = dynamic_cast(m->vec.get())) { - std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1)); - } else if (auto gpuVec = 
dynamic_cast(m->vec.get())) { - hl_memcpy_device2host( - *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1)); - } else { - LOG(INFO) << "Unexpected situation"; - } -} - -void IVector::copyFromNumpyArray(int* data, int dim) { - m->vec->resize(dim); - m->vec->copyFrom(data, dim); -} - -struct VectorPrivate { - paddle::VectorPtr vec; - - void safeAccessData(const size_t idx, - const std::function& func) const - throw(RangeError, UnsupportError) { - auto cpuVec = std::dynamic_pointer_cast(vec); - if (cpuVec != nullptr) { - if (idx < vec->getSize()) { - func(vec->getData()[idx]); - } else { - throw RangeError(); - } - } else { - throw UnsupportError(); - } - } -}; - -Vector::Vector() : m(new VectorPrivate()) {} - -Vector::~Vector() { delete m; } - -Vector* Vector::createZero(size_t sz, bool useGpu) { - auto retVec = new Vector(); - retVec->m->vec = paddle::Vector::create(sz, useGpu); - retVec->m->vec->zero(); - return retVec; -} - -Vector* Vector::create(const std::vector& data, bool useGpu) { - auto retVec = new Vector(); - retVec->m->vec = paddle::Vector::create(data.size(), useGpu); - retVec->m->vec->copyFrom(data.data(), data.size()); - return retVec; -} - -Vector* Vector::createByPaddleVectorPtr(void* ptr) { - auto& v = *(paddle::VectorPtr*)(ptr); - if (v == nullptr) { - return nullptr; - } else { - auto retVec = new Vector(); - retVec->m->vec = v; - return retVec; - } -} - -Vector* Vector::createVectorFromNumpy(float* data, - int dim, - bool copy, - bool useGpu) throw(UnsupportError) { - if (useGpu) { - /// if use gpu only copy=True is supported - if (!copy) { - throw UnsupportError("Gpu mode only supports copy=True"); - } - return Vector::createGpuVectorFromNumpy(data, dim); - } else { - return Vector::createCpuVectorFromNumpy(data, dim, copy); - } -} - -Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) { - CHECK_GT(dim, 0); - auto retVec = new Vector(); - if (copy) { - retVec->m->vec = paddle::Vector::create((size_t)dim, false); - retVec->m->vec->copyFrom(data, dim); - } else { - retVec->m->vec = paddle::Vector::create(data, (size_t)dim, false); - } - return retVec; -} - -Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) { - CHECK_GT(dim, 0); - auto retVec = new Vector(); - retVec->m->vec = paddle::Vector::create((size_t)dim, true); - retVec->m->vec->copyFrom(data, (size_t)dim); - return retVec; -} - -void Vector::toNumpyArrayInplace(float** view_data, - int* dim1) throw(UnsupportError) { - auto v = std::dynamic_pointer_cast(m->vec); - if (v != nullptr) { - *view_data = v->getData(); - *dim1 = (int)v->getSize(); - } else { - throw UnsupportError(); - } -} - -void Vector::copyToNumpyArray(float** view_m_data, int* dim1) { - *dim1 = m->vec->getSize(); - *view_m_data = new float[*dim1]; - if (auto cpuVec = dynamic_cast(m->vec.get())) { - std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1)); - } else if (auto gpuVec = dynamic_cast(m->vec.get())) { - hl_memcpy_device2host( - *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1)); - } else { - LOG(INFO) << "Unexpected situation"; - } -} - -void Vector::copyFromNumpyArray(float* data, int dim) { - m->vec->resize(dim); - m->vec->copyFrom(data, dim); -} - -FloatArray Vector::getData() const { - if (this->isGpu()) { - float* src = m->vec->getData(); - size_t len = m->vec->getSize(); - float* dest = new float[len]; - hl_memcpy_device2host(dest, src, len * sizeof(float)); - FloatArray ret_val(dest, len); - ret_val.needFree = true; - return ret_val; - } else { - FloatArray 
ret_val(m->vec->getData(), m->vec->getSize()); - return ret_val; - } -} - -void Vector::copyFrom(Vector* src) throw(RangeError) { - if (src->m->vec->getSize() != m->vec->getSize()) { - throw RangeError(); - } - m->vec->copyFrom(*src->m->vec); -} - -bool Vector::isGpu() const { - return std::dynamic_pointer_cast(m->vec) != nullptr; -} - -float Vector::get(const size_t idx) const throw(RangeError, UnsupportError) { - float r; - m->safeAccessData(idx, [&](float& o) { r = o; }); - return r; -} - -void Vector::set(const size_t idx, float val) throw(RangeError, - UnsupportError) { - m->safeAccessData(idx, [&](float& o) { o = val; }); -} - -size_t Vector::getSize() const { return m->vec->getSize(); } - -void* Vector::getSharedPtr() { return &m->vec; } diff --git a/paddle/capi/capi_private.h b/paddle/capi/capi_private.h deleted file mode 100644 index 3332f42a4a6e57fed6ddb20cf7d759d67e7240b5..0000000000000000000000000000000000000000 --- a/paddle/capi/capi_private.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "capi.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/parameter/Argument.h" -#pragma once - -namespace paddle { -namespace capi { - -enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE }; - -#define STRUCT_HEADER CType type; - -struct CHeader { - STRUCT_HEADER -}; - -struct CIVector { - STRUCT_HEADER - IVectorPtr vec; - - CIVector() : type(kIVECTOR) {} -}; - -struct CMatrix { - STRUCT_HEADER - MatrixPtr mat; - - CMatrix() : type(kMATRIX) {} -}; - -struct CArguments { - STRUCT_HEADER - std::vector args; - - CArguments() : type(kARGUMENTS) {} - - template - paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) { - if (ID >= args.size()) return kPD_OUT_OF_RANGE; - switch (nestedLevel) { - case 0: - callback(args[ID].sequenceStartPositions); - break; - case 1: - callback(args[ID].subSequenceStartPositions); - break; - default: - return kPD_OUT_OF_RANGE; - } - return kPD_NO_ERROR; - } -}; - -struct CGradientMachine { - STRUCT_HEADER - paddle::GradientMachinePtr machine; - - CGradientMachine() : type(kGRADIENT_MACHINE) {} -}; - -template -inline T* cast(void* ptr) { - return reinterpret_cast(ptr); -} -} // namespace capi -} // namespace paddle diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp deleted file mode 100644 index 8c3f504e5a2d807c0cc664af486ebab4a82ddec3..0000000000000000000000000000000000000000 --- a/paddle/capi/gradient_machine.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
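A usage note on the `create*VectorFromNumpy` family removed above: `copy=false` yields a view that aliases the caller's buffer and is therefore CPU-only, while the GPU path always copies. A sketch against the removed API (illustrative only; the buffer lifetime is the caller's responsibility):

```cpp
float data[4] = {0.f, 1.f, 2.f, 3.f};

// CPU, zero-copy: the Vector aliases `data`, so `data` must outlive it.
Vector* view = Vector::createCpuVectorFromNumpy(data, 4, /*copy=*/false);

// GPU vectors always own a copy; copy=false with useGpu=true would throw
// UnsupportError("Gpu mode only supports copy=True").
Vector* gpu = Vector::createVectorFromNumpy(data, 4, /*copy=*/true,
                                            /*useGpu=*/true);
```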
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "gradient_machine.h" -#include "capi_private.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" - -#define cast(v) paddle::capi::cast(v) - -enum GradientMatchineCreateMode { - CREATE_MODE_NORMAL = 0, - CREATE_MODE_TESTING = 4 -}; - -namespace paddle { - -class MyNeuralNetwork : public NeuralNetwork { - public: - MyNeuralNetwork(const std::string& name, NeuralNetwork* network) - : NeuralNetwork(name, network) {} -}; - -NeuralNetwork* newCustomNerualNetwork(const std::string& name, - NeuralNetwork* network) { - return new MyNeuralNetwork(name, network); -} -} // namespace paddle - -extern "C" { -paddle_error paddle_gradient_machine_create_for_inference( - paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) { - if (modelConfigProtobuf == nullptr) return kPD_NULLPTR; - paddle::ModelConfig config; - if (!config.ParseFromArray(modelConfigProtobuf, size) || - !config.IsInitialized()) { - return kPD_PROTOBUF_ERROR; - } - - auto ptr = new paddle::capi::CGradientMachine(); - ptr->machine.reset(paddle::GradientMachine::create( - config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE})); - *machine = ptr; - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_create_for_inference_with_parameters( - paddle_gradient_machine* machine, void* mergedModel, uint64_t size) { - if (mergedModel == nullptr) return kPD_NULLPTR; - std::istringstream is(std::string(static_cast(mergedModel), size)); - int64_t modelConfigSize = 0; - is.read((char*)(&modelConfigSize), sizeof(modelConfigSize)); - std::string modelConfigProtobuf; - modelConfigProtobuf.resize(modelConfigSize); - is.read(&modelConfigProtobuf[0], modelConfigSize); - paddle::TrainerConfig config; - paddle::ModelConfig modelConfig; - if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) { - if (!modelConfig.ParseFromString(modelConfigProtobuf) || - !modelConfig.IsInitialized()) { - return kPD_PROTOBUF_ERROR; - } - } else { - modelConfig = config.model_config(); - } - auto ptr = new paddle::capi::CGradientMachine(); - ptr->machine.reset(paddle::GradientMachine::create( - modelConfig, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE})); - std::vector& parameters = ptr->machine->getParameters(); - for (auto& para : parameters) { - para->load(is); - } - - *machine = ptr; - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) { - delete cast(machine); - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_load_parameter_from_disk( - paddle_gradient_machine machine, const char* path) { - auto m = cast(machine); - if (m == nullptr || path == nullptr || m->machine == nullptr) - return kPD_NULLPTR; - m->machine->loadParameters(path); - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine, - paddle_arguments inArgs, - paddle_arguments outArgs, - bool isTrain) { - auto m = cast(machine); - auto in = paddle::capi::cast(inArgs); - auto out = paddle::capi::cast(outArgs); - if (m == nullptr || in == nullptr || out == nullptr || m->machine == nullptr) - return kPD_NULLPTR; - 
m->machine->forward( - in->args, &out->args, isTrain ? paddle::PASS_TRAIN : paddle::PASS_TEST); - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_create_shared_param( - paddle_gradient_machine origin, - void* modelConfigProtobuf, - int size, - paddle_gradient_machine* slave) { - auto o = cast(origin); - if (origin == nullptr || slave == nullptr || o->machine == nullptr) { - return kPD_NULLPTR; - } - paddle::ModelConfig config; - if (!config.ParseFromArray(modelConfigProtobuf, size) || - !config.IsInitialized()) { - return kPD_PROTOBUF_ERROR; - } - - std::unique_ptr ptr( - new paddle::capi::CGradientMachine()); - auto nn = paddle::NeuralNetwork::create(config); - nn->init(config, - [&o](int paramId, paddle::Parameter* param) { - auto p = o->machine->getParameters()[paramId]; - param->enableSharedType(paddle::PARAMETER_VALUE, - p->getBuf(paddle::PARAMETER_VALUE)); - }, - {paddle::PARAMETER_VALUE}, - false); - ptr->machine.reset(nn); - *slave = ptr.release(); - return kPD_NO_ERROR; -} -} - -paddle_error paddle_gradient_machine_randomize_param( - paddle_gradient_machine machine) { - auto m = cast(machine); - if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR; - m->machine->randParameters(); - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_get_layer_output( - paddle_gradient_machine machine, - const char* layerName, - paddle_arguments args) { - auto m = cast(machine); - auto out = paddle::capi::cast(args); - if (m == nullptr || layerName == nullptr || out == nullptr || - m->machine == nullptr) { - return kPD_NULLPTR; - } - - auto layerOutput = m->machine->getLayerOutput(layerName); - out->args.push_back(layerOutput); - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_release_layer_output( - paddle_gradient_machine machine) { - auto m = cast(machine); - if (m == nullptr || m->machine == nullptr) { - return kPD_NULLPTR; - } - m->machine->releaseOutput(); - return kPD_NO_ERROR; -} diff --git a/paddle/capi/tests/test_GradientMachine.cpp b/paddle/capi/tests/test_GradientMachine.cpp deleted file mode 100644 index 73b9e477b2a2749250e878cf2174dcf4cc599be1..0000000000000000000000000000000000000000 --- a/paddle/capi/tests/test_GradientMachine.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include "capi.h" -#include "paddle/utils/ThreadLocal.h" - -static std::vector randomBuffer(size_t bufSize) { - auto& eng = paddle::ThreadLocalRandomEngine::get(); - std::uniform_real_distribution dist(-1.0, 1.0); - std::vector retv; - retv.reserve(bufSize); - for (size_t i = 0; i < bufSize; ++i) { - retv.push_back(dist(eng)); - } - return retv; -} - -TEST(GradientMachine, testPredict) { - //! TODO(yuyang18): Test GPU Code. 
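For reference, `paddle_gradient_machine_create_for_inference_with_parameters` above expects a single blob laid out as `[int64 config size | serialized (Trainer|Model)Config | parameter data]`. A hypothetical writer for that layout (a sketch only; `writeMergedModel` and its arguments are not part of the C API):

```cpp
#include <cstdint>
#include <fstream>
#include <string>

// Writes the merged-model blob: an int64_t size header, the serialized model
// config, then raw parameter bytes in the order Parameter::load reads them.
void writeMergedModel(const std::string& path,
                      const std::string& modelConfigProtobuf,
                      const std::string& parameterBytes) {
  std::ofstream os(path, std::ios::binary);
  int64_t modelConfigSize = static_cast<int64_t>(modelConfigProtobuf.size());
  os.write(reinterpret_cast<const char*>(&modelConfigSize),
           sizeof(modelConfigSize));
  os.write(modelConfigProtobuf.data(), modelConfigSize);
  os.write(parameterBytes.data(), parameterBytes.size());
}
```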
- paddle::TrainerConfigHelper config("./test_predict_network.py"); - std::string buffer; - ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer)); - paddle_gradient_machine machine; - - ASSERT_EQ(kPD_NO_ERROR, - paddle_gradient_machine_create_for_inference( - &machine, &buffer[0], (int)buffer.size())); - std::unique_ptr gm( - paddle::GradientMachine::create(config.getModelConfig())); - ASSERT_NE(nullptr, gm); - gm->randParameters(); - gm->saveParameters("./"); - - ASSERT_EQ(kPD_NO_ERROR, - paddle_gradient_machine_load_parameter_from_disk(machine, "./")); - - paddle_gradient_machine machineSlave; - ASSERT_EQ(kPD_NO_ERROR, - paddle_gradient_machine_create_shared_param( - machine, &buffer[0], (int)buffer.size(), &machineSlave)); - std::swap(machineSlave, machine); - paddle_arguments outArgs = paddle_arguments_create_none(); - - paddle_arguments inArgs = paddle_arguments_create_none(); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1)); - paddle_matrix mat = paddle_matrix_create(1, 100, false); - static_assert(std::is_same::value, ""); - - auto data = randomBuffer(100); - paddle_real* rowPtr; - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr)); - memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real)); - - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat)); - ASSERT_EQ(kPD_NO_ERROR, - paddle_gradient_machine_forward(machine, inArgs, outArgs, false)); - - uint64_t sz; - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz)); - ASSERT_EQ(1UL, sz); - - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat)); - std::vector paddleInArgs; - std::vector paddleOutArgs; - paddleInArgs.resize(1); - paddleInArgs[0].value = - paddle::Matrix::create(data.data(), 1, 100, false, false); - - gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST); - - auto matPaddle = paddleOutArgs[0].value; - - uint64_t height, width; - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width)); - ASSERT_EQ(matPaddle->getHeight(), height); - ASSERT_EQ(matPaddle->getWidth(), width); - - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr)); - for (size_t i = 0; i < width; ++i) { - ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5); - } - - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat)); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs)); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs)); - std::swap(machineSlave, machine); - ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave)); - ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine)); -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - std::vector argvs; - argvs.push_back(strdup("--use_gpu=false")); - paddle_init((int)argvs.size(), argvs.data()); - for (auto each : argvs) { - free(each); - } - return RUN_ALL_TESTS(); -} diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 2cd6ab2bbf042bced41957193a0269f477eb10d0..a8bbb4eb8081420ae0bbaf761bd27303c0d043cb 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -46,6 +46,10 @@ cc_library(paddle_inference_api SRCS paddle_inference_api.cc paddle_inference_api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) +cc_library(paddle_inference_api_shared SHARED + SRCS paddle_inference_api.cc paddle_inference_api_impl.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + cc_test(test_paddle_inference_api SRCS test_paddle_inference_api.cc DEPS 
paddle_inference_api) diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h deleted file mode 100644 index 77f5d82dbe2cad183491033736bac85961b6d320..0000000000000000000000000000000000000000 --- a/paddle/cuda/include/hl_base.h +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#ifdef PADDLE_TYPE_DOUBLE -#define HL_FLOAT_MAX 3.40282347e+38F -#define HL_FLOAT_MIN 1.17549435e-38F -using real = double; -#else -#define HL_FLOAT_MAX 1.7976931348623157e+308 -#define HL_FLOAT_MIN 2.2250738585072014e-308 -using real = float; -#endif - -/** - * The maximum input value for exp, used to avoid overflow problem. - * currently only used for tanh function. - */ -#define EXP_MAX_INPUT 40.0 - -/** - * @brief DIVUP(x, y) is similar to ceil(x / y). - * @note For CUDA, DIVUP will be used to specify - * the size of blockDim. - */ -#ifndef DIVUP -#define DIVUP(x, y) (((x) + (y)-1) / (y)) -#endif - -/** - * HPPL is an internal high performance parallel computing library - * for high-level neural network routines, which can support many - * heterogeneous compute architectures, such as GPU, FPGA, etc. - */ - -/** - * @brief HPPL CUDA Stream. - * - * @note Each thread can use HPPL_STREAM_* after calling hl_init. - * HPPL_STREAM_DEFAULT is HPPL default stream. - */ -typedef enum { - HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/ - HPPL_STREAM_1 = 1, - HPPL_STREAM_2 = 2, - HPPL_STREAM_3 = 3, - HPPL_STREAM_4 = 4, - HPPL_THREAD_STREAM_1 = 5, - HPPL_THREAD_STREAM_2 = 6, - HPPL_THREAD_STREAM_3 = 7, - HPPL_THREAD_STREAM_4 = 8, - HPPL_STREAM_END -} hl_stream_t; - -/** - * @brief HPPL activation mode. - */ -typedef enum { - HL_ACTIVATION_SIGMOID = 0, - HL_ACTIVATION_RELU = 1, - HL_ACTIVATION_TANH = 2, - HL_ACTIVATION_LINEAR = 3, - HL_ACTIVATION_END -} hl_activation_mode_t; - -/** - * @brief Transpose type. - */ -typedef enum { - HPPL_OP_N = 0, /* transpose */ - HPPL_OP_T = 1, /* non transpose */ - HPPL_OP_END -} hl_trans_op_t; - -/** - * @brief Lstm value. - * - * @param gateValue input value. - * @param prevStateValue previous state value. - * @param stateValue state value. - * @param stateActiveValue state active value. - * @param outputValue output value. - */ -typedef struct { - real *gateValue; - real *prevStateValue; - real *stateValue; - real *stateActiveValue; - real *outputValue; - real *checkIg; - real *checkFg; - real *checkOg; -} hl_lstm_value; - -/** - * @brief Lstm gradient. - * - * @param gateGrad input gradient. - * @param prevStateGrad previous state gradient. - * @param stateGrad state gradient. - * @param stateActiveGrad state active gradient. - * @param outputGrad output gradient. - */ -typedef struct { - real *gateGrad; - real *prevStateGrad; - real *stateGrad; - real *stateActiveGrad; - real *outputGrad; - real *checkIgGrad; - real *checkFgGrad; - real *checkOgGrad; -} hl_lstm_grad; - -/** - * @brief Gru value. - * - * @param gateWeight gate weight (updateGate + resetGate). 
- * @param stateWeight frame state weight. - * @param gateValue gate value results. - * @param resetOutputValue resetOutput value. - * @param outputValue output value. - * @param prevOutValue previous output value. - * - */ -typedef struct { - real *gateWeight; - real *stateWeight; - real *gateValue; - real *resetOutputValue; - real *outputValue; - real *prevOutValue; -} hl_gru_value; - -/** - * @brief Gru gradient. - * - * @param gateWeightGrad gate weight gradient. - * @param stateWeightGrad frame state weight gradient. - * @param gateGrad gate gradient results. - * @param resetOutputGrad resetOutput gradient. - * @param outputGrad output gradient. - * @param prevOutGrad previous output gradient. - */ -typedef struct { - real *gateWeightGrad; - real *stateWeightGrad; - real *gateGrad; - real *resetOutputGrad; - real *outputGrad; - real *prevOutGrad; -} hl_gru_grad; - -/** - * @brief Sparse matrix value type. - */ -typedef enum { - HL_NO_VALUE = 0, /* matrix values only 0 or 1 */ - HL_FLOAT_VALUE = 1, - HL_VALUE_END -} hl_matrix_value_t; - -/** - * @brief HPPL matrix format. - */ -typedef enum { - HL_SPARSE_CSR = 0, - HL_SPARSE_CSC = 1, - HL_SPARSE_END -} hl_matrix_format_t; - -typedef struct _hl_matrix_s *hl_matrix_s; - -/** - * @brief HPPL sparse matrix. - * - * @param matrix sparse matrix. - * @param format matrix format. - * @param type the type of matrix values. - * @param rows matrix rows. - * @param cols matrix columns. - * @param nnz nonzero values of sparse matrix. - */ -typedef struct { - hl_matrix_s matrix; - hl_matrix_format_t format; - hl_matrix_value_t type; - int rows; - int cols; - size_t nnz; -} _hl_sparse_matrix_s, *hl_sparse_matrix_s; - -#ifdef __NVCC__ - -#include -#include "paddle/cuda/include/hl_cuda.h" -#include "paddle/utils/Logging.h" - -extern __thread bool g_sync_flag; -extern __thread cudaStream_t default_stream; -#define STREAM_DEFAULT default_stream - -/** - * @brief Check cuda kernel execution. - * @param msg error string - */ -#define CHECK_SYNC(msg) \ - if (true == g_sync_flag) { \ - hl_stream_synchronize(HPPL_STREAM_DEFAULT); \ - cudaError_t err = (cudaError_t)hl_get_device_last_error(); \ - CHECK_EQ(cudaSuccess, err) \ - << "[" << msg << "] " \ - << "CUDA error: " << hl_get_device_error_string((size_t)err); \ - } - -// __shfl has been deprecated as of CUDA 9.0. -#if CUDA_VERSION < 9000 -template -__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { - return __shfl_down(val, delta); -} - -template -__forceinline__ __device__ T -__shfl_sync(unsigned, T val, int src_line, int width) { - return __shfl(val, src_line, width); -} - -#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; -#else -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -#endif - -#endif // __NVCC__ diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu deleted file mode 100644 index b17290557c4f635a963d88525409b9373b057a4b..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_top_k.cu +++ /dev/null @@ -1,481 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
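The `__shfl` shims and `CREATE_SHFL_MASK` above exist so one kernel body compiles both before and after CUDA 9.0. A sketch of how they combine with `DIVUP` (defined earlier in this header) in a warp-sum kernel; `KeWarpSum` is illustrative, not a kernel from this file:

```cuda
// Each 32-thread warp reduces its elements; lane 0 stores the warp's partial
// sum. `partial` needs DIVUP(n, 32) slots.
__global__ void KeWarpSum(const real* in, real* partial, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  real val = i < n ? in[i] : 0;
  unsigned mask = 0u;
  CREATE_SHFL_MASK(mask, true);  // all lanes participate
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(mask, val, offset);
  }
  if ((threadIdx.x & 31) == 0) partial[i / 32] = val;
}

// Host side: DIVUP rounds the grid up, CHECK_SYNC verifies the launch.
// KeWarpSum<<<DIVUP(n, 256), 256, 0, STREAM_DEFAULT>>>(d_in, d_partial, n);
// CHECK_SYNC("KeWarpSum failed");
```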
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/cuda/include/hl_base.h" -#include "paddle/cuda/include/hl_sparse.ph" -#include "paddle/cuda/include/hl_top_k.h" -#include "paddle/utils/Logging.h" - -// using namespace hppl; - -struct Pair { - __device__ __forceinline__ Pair() {} - - __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {} - - __device__ __forceinline__ void set(real value, int id) { - v_ = value; - id_ = id; - } - - __device__ __forceinline__ void operator=(const Pair& in) { - v_ = in.v_; - id_ = in.id_; - } - - __device__ __forceinline__ bool operator<(const real value) const { - return (v_ < value); - } - - __device__ __forceinline__ bool operator<(const Pair& in) const { - return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_)); - } - - __device__ __forceinline__ bool operator>(const Pair& in) const { - return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_)); - } - - real v_; - int id_; -}; - -__device__ __forceinline__ void addTo(Pair topK[], - const Pair& p, - int beamSize) { - for (int k = beamSize - 2; k >= 0; k--) { - if (topK[k] < p) { - topK[k + 1] = topK[k]; - } else { - topK[k + 1] = p; - return; - } - } - topK[0] = p; -} - -template -__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) { - for (int k = beamSize - 2; k >= 0; k--) { - if (topK[k] < p) { - topK[k + 1] = topK[k]; - } else { - topK[k + 1] = p; - return; - } - } - topK[0] = p; -} - -template -__device__ __forceinline__ void getTopK( - Pair topK[], real* src, int idx, int dim, int beamSize) { - while (idx < dim) { - if (topK[beamSize - 1] < src[idx]) { - Pair tmp(src[idx], idx); - addTo(topK, tmp, beamSize); - } - idx += blockSize; - } -} - -template -__device__ __forceinline__ void getTopK( - Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) { - while (idx < dim) { - if (topK[beamSize - 1] < src[idx]) { - Pair tmp(src[idx], idx); - if (tmp < max) { - addTo(topK, tmp, beamSize); - } - } - idx += blockSize; - } -} - -template -__device__ __forceinline__ void getTopK( - Pair topK[], real* val, int* col, int idx, int dim, int beamSize) { - while (idx < dim) { - if (topK[beamSize - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - addTo(topK, tmp, beamSize); - } - idx += blockSize; - } -} - -template -__device__ __forceinline__ void getTopK(Pair topK[], - real* val, - int* col, - int idx, - int dim, - const Pair& max, - int beamSize) { - while (idx < dim) { - if (topK[beamSize - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - if (tmp < max) { - addTo(topK, tmp, beamSize); - } - } - idx += blockSize; - } -} - -template -__device__ __forceinline__ void threadGetTopK(Pair topK[], - int& beam, - int beamSize, - real* src, - bool& firstStep, - bool& isEmpty, - Pair& max, - int dim, - const int tid) { - if (beam > 0) { - int length = beam < beamSize ? 
beam : beamSize; - if (firstStep) { - firstStep = false; - getTopK(topK, src, tid, dim, length); - } else { - for (int k = 0; k < maxLength; k++) { - if (k < maxLength - beam) { - topK[k] = topK[k + beam]; - } else { - topK[k].set(-HL_FLOAT_MAX, -1); - } - } - if (!isEmpty) { - getTopK(topK + maxLength - beam, src, tid, dim, max, length); - } - } - - max = topK[maxLength - 1]; - if (max.id_ == -1) isEmpty = true; - beam = 0; - } -} - -template -__device__ __forceinline__ void threadGetTopK(Pair topK[], - int& beam, - int beamSize, - real* val, - int* col, - bool& firstStep, - bool& isEmpty, - Pair& max, - int dim, - const int tid) { - if (beam > 0) { - int length = beam < beamSize ? beam : beamSize; - if (firstStep) { - firstStep = false; - getTopK(topK, val, col, tid, dim, length); - } else { - for (int k = 0; k < maxLength; k++) { - if (k < maxLength - beam) { - topK[k] = topK[k + beam]; - } else { - topK[k].set(-HL_FLOAT_MAX, -1); - } - } - if (!isEmpty) { - getTopK( - topK + maxLength - beam, val, col, tid, dim, max, length); - } - } - - max = topK[maxLength - 1]; - if (max.id_ == -1) isEmpty = true; - beam = 0; - } -} - -template -__device__ __forceinline__ void blockReduce(Pair* shTopK, - int* maxId, - Pair topK[], - real** topVal, - int** topIds, - int& beam, - int& beamSize, - const int tid, - const int warp) { - while (true) { - __syncthreads(); - if (tid < blockSize / 2) { - if (shTopK[tid] < shTopK[tid + blockSize / 2]) { - maxId[tid] = tid + blockSize / 2; - } else { - maxId[tid] = tid; - } - } - __syncthreads(); - for (int stride = blockSize / 4; stride > 0; stride = stride / 2) { - if (tid < stride) { - if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) { - maxId[tid] = maxId[tid + stride]; - } - } - __syncthreads(); - } - __syncthreads(); - - if (tid == 0) { - **topVal = shTopK[maxId[0]].v_; - **topIds = shTopK[maxId[0]].id_; - (*topVal)++; - (*topIds)++; - } - if (tid == maxId[0]) beam++; - if (--beamSize == 0) break; - __syncthreads(); - - // NOTE(zcd): temporary solution - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - - if (tid == maxId[0]) { - if (beam < maxLength) { - shTopK[tid] = topK[beam]; - } - } - if (maxId[0] / 32 == warp) { - if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break; - } - } -} - -/** - * Each block compute one sample. - * In a block: - * 1. every thread get top maxLength value; - * 2. merge to shTopK, block reduce and get max value; - * 3. go to the second setp, until one thread's topK value is null; - * 4. go to the first setp, until get the topK value. 
- */ -template -__global__ void KeMatrixTopK(real* topVal, - int ldv, - int* topIds, - real* src, - int lds, - int dim, - int beamSize) { - __shared__ Pair shTopK[blockSize]; - __shared__ int maxId[blockSize / 2]; - const int tid = threadIdx.x; - const int warp = threadIdx.x / 32; - src += blockIdx.x * lds; - topVal += blockIdx.x * ldv; - topIds += blockIdx.x * beamSize; - - Pair topK[maxLength]; // NOLINT - int beam = maxLength; - Pair max; - bool isEmpty = false; - bool firstStep = true; - - for (int k = 0; k < maxLength; k++) { - topK[k].set(-HL_FLOAT_MAX, -1); - } - while (beamSize) { - threadGetTopK( - topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); - - shTopK[tid] = topK[0]; - blockReduce( - shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); - } -} - -template -__global__ void KeSMatrixTopK(real* topVal, - int ldv, - int* topIds, - real* val, - int* row, - int* col, - int beamSize) { - __shared__ Pair shTopK[blockSize]; - __shared__ int maxId[blockSize / 2]; - const int tid = threadIdx.x; - const int warp = threadIdx.x / 32; - topVal += blockIdx.x * ldv; - topIds += blockIdx.x * beamSize; - - Pair topK[maxLength]; // NOLINT - int beam = maxLength; - Pair max; - bool isEmpty = false; - bool firstStep = true; - - int start = row[blockIdx.x]; - int end = row[blockIdx.x + 1]; - int dim = end - start; - val += start; - col += start; - - if (beamSize > dim) { - // if the number of values to sort are less than the output size, - // use -1 to indicate the end of valid sorted values. - if (tid == 0) { - topIds[dim] = -1; - } - - beamSize = dim; - } - - for (int k = 0; k < maxLength; k++) { - topK[k].set(-HL_FLOAT_MAX, -1); - } - while (beamSize) { - threadGetTopK( - topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); - - shTopK[tid] = topK[0]; - blockReduce( - shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); - } -} - -void hl_matrix_top_k(real* topVal, - int ldv, - int* topIds, - real* src, - int lds, - int dim, - int beamSize, - int numSamples) { - CHECK_NOTNULL(topVal); - CHECK_NOTNULL(topIds); - CHECK_NOTNULL(src); - - if (beamSize > dim) beamSize = dim; - - dim3 threads(256, 1); - dim3 grid(numSamples, 1); - KeMatrixTopK<5, 256><<>>( - topVal, ldv, topIds, src, lds, dim, beamSize); - - CHECK_SYNC("hl_matrix_top_k failed"); -} - -void hl_sparse_matrix_top_k(real* topVal, - int ldv, - int* topIds, - hl_sparse_matrix_s src, - int beamSize, - int numSamples) { - CHECK_NOTNULL(topVal); - CHECK_NOTNULL(topIds); - CHECK_NOTNULL(src); - CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!"; - - hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) { - LOG(FATAL) << "parameter src is null!"; - } - - dim3 threads(256, 1); - dim3 grid(numSamples, 1); - KeSMatrixTopK<5, 256><<>>( - topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); - - CHECK_SYNC("hl_sparse_matrix_top_k failed"); -} - -/** - * Each block compute one sample. - * In a block: - * 1. every thread get top maxLength value; - * 2. merge to shTopK, block reduce and get max value; - * 3. go to the second setp, until one thread's topK value is null; - * 4. go to the first setp, until get the topK value. 
- */ -template -__global__ void KeMatrixTopKClassificationError(real* topVal, - int ldv, - int* topIds, - real* src, - int lds, - int dim, - int beamSize, - int* label, - real* recResult) { - __shared__ Pair shTopK[blockSize]; - __shared__ int maxId[blockSize / 2]; - const int tid = threadIdx.x; - const int warp = threadIdx.x / 32; - src += blockIdx.x * lds; - topVal += blockIdx.x * ldv; - topIds += blockIdx.x * beamSize; - - Pair topK[maxLength]; // NOLINT - int beam = maxLength; - Pair max; - bool isEmpty = false; - bool firstStep = true; - int topkSize = beamSize; - - for (int k = 0; k < maxLength; k++) { - topK[k].set(-HL_FLOAT_MAX, -1); - } - - while (beamSize) { - threadGetTopK( - topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); - - shTopK[tid] = topK[0]; - blockReduce( - shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); - } - - __syncthreads(); - if (tid == 0) { - for (int i = 0; i < topkSize; i++) { - if (*--topIds == label[blockIdx.x]) { - recResult[blockIdx.x] = 0; - break; - } - recResult[blockIdx.x] = 1.0f; - } - } -} - -void hl_matrix_classification_error(real* topVal, - int ldv, - int* topIds, - real* src, - int lds, - int dim, - int topkSize, - int numSamples, - int* label, - real* recResult) { - CHECK_NOTNULL(topVal); - CHECK_NOTNULL(topIds); - CHECK_NOTNULL(src); - - if (topkSize > dim) topkSize = dim; - - dim3 threads(256, 1); - dim3 grid(numSamples, 1); - KeMatrixTopKClassificationError<5, 256><<>>( - topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); - - CHECK_SYNC("hl_matrix_top_k classification error failed"); -} diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index bc48fd3b479157d4aea390cd5f4dc61ea46dca4b..cd00b7de7338982308acfa1f1e8c38e010c6a43b 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: ", in.type().name()); memory::data_type out_type = in_type; - auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format()); + auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); auto out_format = - MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); + platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); void* in_data = GetDataFromTensor(in, in_type); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 67f91e4e48d3e11ed493c5e6943cb9071aff60c4..90bb206ec6b698bc23ad1a5c9609a25186ec6de8 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { return MKLDNNDataType::data_undef; } -inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size, - MKLDNNFormat default_format) { - return (dims_size == 1 - ? mkldnn::memory::format::x - : dims_size == 2 ? mkldnn::memory::format::nc : default_format); -} #endif void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 5f15e20c78fd5a333523fe9e73542c037a161cae..82872224501709080ff02a13464d58543a0abda8 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -18,17 +18,21 @@ limitations under the License. 
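Every kernel in the deleted `hl_top_k.cu` above builds on one primitive, `addTo()`: insert a candidate into a small buffer kept in descending order, shifting smaller entries down. A host-side model of it that runs standalone:

```cpp
#include <cstdio>

struct Pair {
  float v_;
  int id_;
};

// Same shift-and-insert logic as the device addTo() in hl_top_k.cu.
void addTo(Pair topK[], const Pair& p, int beamSize) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k].v_ < p.v_) {
      topK[k + 1] = topK[k];
    } else {
      topK[k + 1] = p;
      return;
    }
  }
  topK[0] = p;
}

int main() {
  Pair topK[3];
  for (auto& e : topK) e = {-1e30f, -1};
  const float src[] = {0.3f, 0.9f, 0.1f, 0.7f};
  for (int i = 0; i < 4; ++i) {
    // The kernels guard with the current minimum before inserting.
    if (topK[2].v_ < src[i]) addTo(topK, {src[i], i}, 3);
  }
  for (const auto& e : topK) printf("%d:%.1f ", e.id_, e.v_);
  // prints: 1:0.9 3:0.7 0:0.3
  return 0;
}
```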
*/ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_type_transform.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace framework { -static void PassTensorData(Tensor* from, Tensor* to) { +static void PassTensorData(Tensor *from, Tensor *to) { to->ShareDataWith(*from); *from = Tensor(); } -void DataTransform(const OpKernelType& expected_kernel_type, - const OpKernelType& kernel_type_for_var, - const Tensor& input_tensor, Tensor* output_tensor) { +void TransformData(const OpKernelType &expected_kernel_type, + const OpKernelType &kernel_type_for_var, + const Tensor &input_tensor, Tensor *output_tensor) { bool transformed = false; Tensor in; in.ShareDataWith(input_tensor); @@ -48,8 +52,8 @@ void DataTransform(const OpKernelType& expected_kernel_type, // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. No real transform occur - auto out_format = - MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin)); + auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), + ToMKLDNNFormat(lin)); out.ShareDataWith(input_tensor); out.set_layout(DataLayout::kMKLDNN); @@ -89,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type, output_tensor->ShareDataWith(in); } -void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, - Variable* out_var) { +void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, + Variable *out_var) { if (in_var.IsType()) { - auto& in_lod_tensor = in_var.Get(); - auto* tran_lod_tensor = out_var->GetMutable(); + auto &in_lod_tensor = in_var.Get(); + auto *tran_lod_tensor = out_var->GetMutable(); tran_lod_tensor->set_lod(in_lod_tensor.lod()); tran_lod_tensor->set_layout(in_lod_tensor.layout()); tran_lod_tensor->ShareDataWith(tensor); } else if (in_var.IsType()) { - auto& in_selected_rows = in_var.Get(); - auto* trans_selected_rows = out_var->GetMutable(); + auto &in_selected_rows = in_var.Get(); + auto *trans_selected_rows = out_var->GetMutable(); trans_selected_rows->set_height(in_selected_rows.height()); trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->mutable_value()->ShareDataWith(tensor); diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h index dee5d8c7c1126013742460df1d94bb364220ad09..ae3ab051bda2e698801cc6fe6e3ddddf039f5385 100644 --- a/paddle/fluid/framework/data_transform.h +++ b/paddle/fluid/framework/data_transform.h @@ -30,12 +30,15 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -void DataTransform(const OpKernelType& expected_kernel_type, - const OpKernelType& kernel_type_for_var, - const Tensor& input_tensor, Tensor* out); - -void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, - Variable* out_var); +void TransformData(const OpKernelType &expected_kernel_type, + const OpKernelType &kernel_type_for_var, + const Tensor &input_tensor, Tensor *out); + +/** + * Set OutVar from InVar, except the tensor is shared with `tensor` + */ +void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, + Variable *out_var); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 3c73b6cc55c187c3f6e7edd1ce38cc58f4e8413d..4fb4ec38ee965a2790d11378a1ce6befa0ef5a00 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -25,11 +25,12 @@ else() cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) endif() +cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle - scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle) + scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle) cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 64e83acb4dc1995800c4ca3caf81668b24a7c9fe..9c2c845c6efb206fb1ad5150189430b9a6fe9ea3 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -33,6 +33,8 @@ struct BuildStrategy { GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; std::string debug_graphviz_path_{""}; + + bool enable_data_balance_{true}; }; } // namespace details diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..b914851fe0add74f6d85589f4686224b668b8064 --- /dev/null +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
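The new `enable_data_balance_` flag in `build_strategy.h` above defaults to true. A hedged usage sketch for opting out when every device is guaranteed a non-empty batch (field access as declared in the diff; the surrounding executor setup is assumed):

```cpp
paddle::framework::details::BuildStrategy strategy;
strategy.enable_data_balance_ = false;  // skip inserting DataBalanceOpHandles
```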
+ +#include "paddle/fluid/framework/details/data_balance_op_handle.h" +#include +#include "paddle/fluid/framework/details/container_cast.h" + +namespace paddle { +namespace framework { +namespace details { + +#ifdef PADDLE_WITH_CUDA +DataBalanceOpHandle::DataBalanceOpHandle( + const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs) + : local_scopes_(local_scopes), places_(places) { + if (ctxs) { + for (auto &p : places_) { + this->dev_ctxes_[p] = ctxs->DevCtx(p); + } + } +} +#else +DataBalanceOpHandle::DataBalanceOpHandle( + const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} +#endif + +std::string DataBalanceOpHandle::Name() const { return "data balance"; } + +std::vector> DataBalanceOpHandle::GetBalancePlan( + const std::vector &device_sizes) { + int device_num = device_sizes.size(); + int total_size = 0; + int empty_num = 0; + std::vector> size_device_vec; + size_device_vec.reserve(device_num); + for (int i = 0; i < device_num; ++i) { + if (device_sizes[i] == 0) { + ++empty_num; + } + total_size += device_sizes[i]; + size_device_vec.push_back({{device_sizes[i], i}}); + } + std::vector> res; + if (empty_num == 0) { + // No need to do data balance. + return res; + } + if (total_size < device_num) { + // No enough data. + PADDLE_THROW("There is no next data."); + } + std::sort(size_device_vec.begin(), size_device_vec.end(), + [](const std::array &a, const std::array &b) { + return a[0] > b[0]; + }); + int expected_device_size = total_size / device_num; + int src_idx = 0; + for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) { + if (size_device_vec[src_idx][0] <= expected_device_size) { + ++src_idx; + PADDLE_ENFORCE_LT( + src_idx, device_num - empty_num, + "In current srategy an empty tensor should not be copy source."); + } + size_device_vec[src_idx][0] -= expected_device_size; + size_device_vec[dst_idx][0] += expected_device_size; + res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1], + expected_device_size}}); + } + return res; +} + +void DataBalanceOpHandle::RunImpl() { + if (places_.size() == 1) { + return; + } + auto in_var_handles = DynamicCast(inputs_); + auto out_var_handles = DynamicCast(outputs_); + PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + int data_num = in_var_handles.size() / places_.size(); + WaitInputVarGenerated(); + std::vector> lod_tensors(data_num); + std::vector device_sizes; + for (int i = 0; i < static_cast(in_var_handles.size()); ++i) { + PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); + int place_idx = i / data_num; + int data_idx = i % data_num; + auto *local_scope = + local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get(); + auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name_); + PADDLE_ENFORCE(tensor_var->IsType()); + auto *tensor = tensor_var->GetMutable(); + lod_tensors[data_idx].push_back(tensor); + int ins_size = + tensor->lod().empty() ? 
tensor->dims()[0] : tensor->NumElements(); + if (data_idx == 0) { + device_sizes.emplace_back(ins_size); + } else { + PADDLE_ENFORCE_EQ( + ins_size, device_sizes.at(place_idx), + "All data on the same device shall have the same batch size."); + } + } + const auto &balance_plan = GetBalancePlan(device_sizes); + + for (const auto &trans : balance_plan) { + for (int data_idx = 0; data_idx < data_num; ++data_idx) { + LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]]; + LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]]; + int trans_ins_size = trans[2]; + LoD src_lod = src_tensor->lod(); + int src_ins_size = + src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements(); + int cut_point = src_ins_size - trans_ins_size; + if (!src_lod.empty()) { + for (auto &level : src_lod) { + cut_point = level[cut_point]; + } + } + TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]), + dst_tensor->place(), dst_tensor); + src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point)); + if (!src_lod.empty()) { + dst_tensor->set_lod(SliceInLevel( + src_lod, 0, src_ins_size - trans_ins_size, src_ins_size)); + src_tensor->set_lod( + SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size)); + } + } + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..76a407e3610e8bb48facf1f814779f4c23f92d98 --- /dev/null +++ b/paddle/fluid/framework/details/data_balance_op_handle.h @@ -0,0 +1,59 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
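To make the redistribution concrete, here is a standalone model of `GetBalancePlan()` from `data_balance_op_handle.cc` above, with the `PADDLE_THROW` path simplified to an early return:

```cpp
#include <algorithm>
#include <array>
#include <cstdio>
#include <vector>

// Returns {src_dev, dst_dev, trans_size} triples that top up empty devices.
std::vector<std::array<int, 3>> GetBalancePlan(const std::vector<int>& sizes) {
  int device_num = static_cast<int>(sizes.size());
  int total = 0, empty_num = 0;
  std::vector<std::array<int, 2>> size_device;  // {size, device_id}
  for (int i = 0; i < device_num; ++i) {
    if (sizes[i] == 0) ++empty_num;
    total += sizes[i];
    size_device.push_back({{sizes[i], i}});
  }
  std::vector<std::array<int, 3>> res;
  if (empty_num == 0) return res;      // nothing to balance
  if (total < device_num) return res;  // the real code throws here
  std::sort(size_device.begin(), size_device.end(),
            [](const std::array<int, 2>& a, const std::array<int, 2>& b) {
              return a[0] > b[0];
            });
  int expected = total / device_num, src = 0;
  for (int dst = device_num - empty_num; dst < device_num; ++dst) {
    if (size_device[src][0] <= expected) ++src;  // current source is drained
    size_device[src][0] -= expected;
    size_device[dst][0] += expected;
    res.push_back({{size_device[src][1], size_device[dst][1], expected}});
  }
  return res;
}

int main() {
  // Three devices whose readers produced 4, 2 and 0 instances:
  for (const auto& t : GetBalancePlan({4, 2, 0}))
    printf("move %d instances: dev%d -> dev%d\n", t[2], t[0], t[1]);
  // prints: move 2 instances: dev0 -> dev2
  return 0;
}
```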
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct DataBalanceOpHandle : public OpHandleBase {
+ public:
+#ifdef PADDLE_WITH_CUDA
+  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+                      const std::vector<platform::Place> &places,
+                      const platform::NCCLContextMap *ctxs);
+#else
+  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+                      const std::vector<platform::Place> &places);
+#endif
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  // std::vector<(src_dev_id, dst_dev_id, trans_size)>
+  std::vector<std::array<int, 3>> GetBalancePlan(
+      const std::vector<int> &batch_size_per_device);
+
+  const std::vector<Scope *> local_scopes_;
+  const std::vector<platform::Place> places_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 224e8e1f6efd7a894591ac51c929517cae7539ce..d646c944601e81477787740189d7ac60ae97fa80 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -67,8 +67,8 @@ void FetchOpHandle::RunImpl() {
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
-      tensors_[i].set_lod(t.lod());
     }
+    tensors_[i].set_lod(t.lod());
   }
 
   this->WaitAndMergeCPUTensors();
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index cc7b94d0653e34c8ac711a7db7ab6ab1a9ac46a2..46d0c2769cb334f5cb75ae0ef5e48da45448c48f 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/data_balance_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
@@ -215,7 +216,14 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     } else {
       // This op runs on all devices, and its output may have parameter's
       // gradients.
- CreateComputationalOps(&result, *op, places_.size()); + if (op->Type() == "read" && strategy_.enable_data_balance_) { + op->SetAttr("throw_eof_exp", false); + CreateComputationalOps(&result, *op, places_.size()); + const auto &data_var_names = op->Output("Out"); + InsertDataBalanceOp(&result, data_var_names); + } else { + CreateComputationalOps(&result, *op, places_.size()); + } if (!is_forwarding && places_.size() > 1) { // Currently, we assume that once gradient is generated, it can be @@ -360,6 +368,29 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, } } +void MultiDevSSAGraphBuilder::InsertDataBalanceOp( + SSAGraph *result, const std::vector &datas) const { +#ifdef PADDLE_WITH_CUDA + result->ops_.emplace_back( + new DataBalanceOpHandle(local_scopes_, places_, nccl_ctxs_)); +#else + result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_)); +#endif + auto *op_handle = result->ops_.back().get(); + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + for (const std::string &d_name : datas) { + auto &vars = result->vars_[i][d_name]; + PADDLE_ENFORCE(!vars.empty()); + op_handle->AddInput(vars.back().get()); + auto var = new VarHandle(vars.size(), i, d_name, p); + vars.emplace_back(var); + op_handle->AddOutput(var); + } + } +} + bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( const std::string &og, std::unordered_set *og_has_been_broadcast) const { @@ -512,7 +543,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); // the variable name which contains .block means it was splited by // split_byref op - // so that we can balance the variable blocks to all the pserver instances. + // so that we can balance the variable blocks to all the pserver + // instances. 
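The hunk above only rewires `read` ops when data balancing is requested: `throw_eof_exp` is switched off so a device that hits EOF yields a short (possibly empty) batch instead of throwing, and `InsertDataBalanceOp` appends the balancing handle right after the readers. A hedged sketch of how a caller might opt in, assuming `BuildStrategy` carries the `enable_data_balance_` flag referenced above (the header path is also an assumption):

```cpp
#include "paddle/fluid/framework/details/build_strategy.h"  // path assumed

paddle::framework::details::BuildStrategy MakeBalancedStrategy() {
  paddle::framework::details::BuildStrategy strategy;
  // With this flag on, every multi-device "read" op is followed by a
  // DataBalanceOpHandle and readers no longer throw on EOF, so the last
  // short batch of a pass can be redistributed across devices.
  strategy.enable_data_balance_ = true;  // field name taken from the builder above
  return strategy;
}
```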
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && op.InputArgumentNames()[0].find(".block") == std::string::npos) { op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 0b6347bf51dc1c347073a0fdcf4ddd91865d846d..a964e024885e56693224a6199e00ff30beaa1df4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -101,6 +101,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; + void InsertDataBalanceOp(SSAGraph *result, + const std::vector &datas) const; + void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, size_t src_dev_id) const; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 1f84c3b9e2d7ee9ae51959988fceeb3451b7b3b8..3560fabb424375a770432586fe7c8e51210b3d0c 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -58,8 +58,10 @@ void OpHandleBase::Run(bool use_cuda) { void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { #ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_NOT_NULL(waited_ctx); if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) { for (auto &dev_ctx : dev_ctxes_) { + PADDLE_ENFORCE_NOT_NULL(dev_ctx.second); dev_ctx.second->Wait(); } } else { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index ae98fccc9600a2a75f12fa516c982bec0ef13f9f..84f67fafa19ac545ebb7a1019059e3c74c363c56 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -20,9 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" -#ifdef PADDLE_WITH_DISTRIBUTE -#include "paddle/fluid/operators/distributed/grpc_client.h" -#endif +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -48,10 +46,16 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { Executor::Executor(const platform::Place& place) : place_(place) {} #ifdef PADDLE_WITH_DISTRIBUTE -void Executor::Complete() { +void Executor::BeginPass() { + ::paddle::operators::distributed::RPCClient::GetInstance< + ::paddle::operators::distributed::GRPCClient>() + ->SendBeginPass(); +} + +void Executor::EndPass() { ::paddle::operators::distributed::RPCClient::GetInstance< ::paddle::operators::distributed::GRPCClient>() - ->SendComplete(); + ->SendEndPass(); } #endif diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 3aa5ffef69cd29681f248e915575c5715ad0d3fa..563a4b2bb65dad481a755f67c7f23939816ce8e8 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -46,9 +46,14 @@ class Executor { #ifdef PADDLE_WITH_DISTRIBUTE /* - * Sending signal to pserver to mark current trainer stop. + * Sending signal to pserver to mark current pass started. */ - void Complete(); + void BeginPass(); + + /* + * Sending signal to pserver to mark current pass finished. 
+ */ + void EndPass(); #endif /* @Brief diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index d29d8ce1c561e45980d10c17c984ca2ed3b453f3..cba0064f38f89c1dd27cfac1ddb2339a5ee6c93f 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" @@ -68,9 +69,9 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - if (t.type().hash_code() == typeid(float).hash_code()) { + if (IsType(t.type())) { os << t.data()[i] << " "; - } else if (t.type().hash_code() == typeid(int64_t).hash_code()) { + } else if (IsType(t.type())) { os << t.data()[i] << " "; } else { PADDLE_THROW("LoDTensor data type not in [float, int64_t]"); @@ -89,6 +90,7 @@ std::string LoDToString(const LoD &lod) { LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, size_t elem_end) { PADDLE_ENFORCE_LT(level, in.size()); + PADDLE_ENFORCE_LT(elem_begin, elem_end); PADDLE_ENFORCE_LT(elem_end, in[level].size()); LoD res; @@ -384,7 +386,7 @@ void LoDTensor::MergeLoDTensor( LoD new_lod = lod_tensors[0]->lod(); for (size_t i = 1; i < lod_tensors.size(); ++i) { auto *t = lod_tensors[i]; - PADDLE_ENFORCE_EQ(new_type.hash_code(), t->type().hash_code()); + PADDLE_ENFORCE_EQ(new_type, t->type()); PADDLE_ENFORCE_EQ(new_layout, t->layout()); PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0], @@ -392,6 +394,7 @@ void LoDTensor::MergeLoDTensor( new_dim[0] += t->dims()[0]; auto &lod = t->lod(); + PADDLE_ENFORCE_EQ(new_lod.size(), lod.size()); for (size_t j = 0; j < lod.size(); ++j) { auto &sub_lod = new_lod[j]; auto &offset = sub_lod.back(); diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index f51a184e7bae2283f335fe9462a77b9c5fb831a5..c59b232191c49ccb47bb9f51dcaf2fd9280fae19 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -97,7 +97,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { return ret; } -inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { +inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) { return (!platform::places_are_same_class(l.place_, r.place_)) || (l.data_type_ != r.data_type_) || NeedTransformLayout(l.data_layout_, r.data_layout_); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 43ab227a9478707445892c14723801992d0041aa..3314e41cc51d74f87be0e2cd5eba9bb260c16be7 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -76,6 +76,20 @@ class OpRegistry { template struct OpKernelRegistrarFunctor; +template +inline void RegisterKernelClass(const char* op_type, const char* library_type, + Func func) { + std::string library(library_type); + std::string data_layout = "ANYLAYOUT"; + if (library == "MKLDNN") { + data_layout = "MKLDNNLAYOUT"; + } + OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), + StringToDataLayout(data_layout), + StringToLibraryType(library_type)); + OperatorWithKernel::AllOpKernels()[op_type][key] = func; +} + 
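`RegisterKernelClass` above is the pivot of this change: kernels are now stored as plain callables keyed by `OpKernelType`, so a kernel no longer has to be an `OpKernel` subclass. A hedged sketch of what the functor-based macros (defined just below) enable; the op name, the functor, and the (element type, functor) pairing order are illustrative assumptions, since the tuple indices are elided in the excerpt:

```cpp
#include <algorithm>

#include "paddle/fluid/framework/op_registry.h"

// A stateless kernel functor with the ExecutionContext-only signature the
// new registry stores; it never needs to derive from OpKernel.
template <typename T>
struct FillZeroFunctor {
  void operator()(const paddle::framework::ExecutionContext& ctx) const {
    auto* out = ctx.Output<paddle::framework::LoDTensor>("Out");
    T* data = out->mutable_data<T>(ctx.GetPlace());
    std::fill(data, data + out->numel(), static_cast<T>(0));
  }
};

// Hypothetical registration of one functor per element type.
REGISTER_OP_CPU_KERNEL_FUNCTOR(fill_zero, float, FillZeroFunctor<float>,
                               double, FillZeroFunctor<double>);
```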
template struct OpKernelRegistrarFunctor { using KERNEL_TYPE = @@ -83,16 +97,10 @@ struct OpKernelRegistrarFunctor { void operator()(const char* op_type, const char* library_type) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; - std::string library(library_type); - std::string data_layout = "ANYLAYOUT"; - if (library == "MKLDNN") { - data_layout = "MKLDNNLAYOUT"; - } - OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), - StringToDataLayout(data_layout), - StringToLibraryType(library_type)); - OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); - + RegisterKernelClass( + op_type, library_type, [](const framework::ExecutionContext& ctx) { + KERNEL_TYPE().Compute(ctx); + }); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor func; @@ -116,6 +124,47 @@ class OpKernelRegistrar : public Registrar { } }; +template +struct OpKernelRegistrarFunctorEx; + +template +class OpKernelRegistrarEx : public Registrar { + public: + explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) { + OpKernelRegistrarFunctorEx + func; + func(op_type, library_type); + } +}; + +template +struct OpKernelRegistrarFunctorEx { + void operator()(const char* op_type, const char* library_type) const {} +}; + +template +struct OpKernelRegistrarFunctorEx { + using Functor = + typename std::tuple_element>::type; + using T = + typename std::tuple_element>::type; + + void operator()(const char* op_type, const char* library_type) const { + RegisterKernelClass(op_type, library_type, Functor()); + + constexpr auto size = + std::tuple_size>::value; + OpKernelRegistrarFunctorEx= size, I + 2, + DataTypeAndKernelType...> + func; + func(op_type, library_type); + } +}; + /** * check if MACRO is used in GLOBAL NAMESPACE. */ @@ -174,6 +223,25 @@ class OpKernelRegistrar : public Registrar { #define REGISTER_OP_CPU_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) +#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##library_type##__, \ + "REGISTER_OP_KERNEL_EX must be called in global namespace"); \ + static ::paddle::framework::OpKernelRegistrarEx \ + __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ + #library_type); \ + int TouchOpKernelRegistrar_##op_type##_##library_type() { \ + __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ + return 0; \ + } + +#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \ + REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \ + __VA_ARGS__) + +#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) 
\ + REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) + /** * Macro to mark what Operator and Kernel * we will use and tell the compiler to diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c1329b06d7e9bcd6604fed14cefa305339c5c4b8..3cf8e8696d739e3f2894e490161b9fb5b459bc41 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -592,8 +592,7 @@ static void CheckTensorNANOrInf(const std::string& name, if (tensor.memory_size() == 0) { return; } - if (tensor.type().hash_code() != typeid(float).hash_code() && // NOLINT - tensor.type().hash_code() != typeid(double).hash_code()) { // NOLINT + if (!IsType(tensor.type()) && !IsType(tensor.type())) { return; } PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), @@ -620,8 +619,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, "There are no kernels which are registered in the %s operator.", type_); } - ExecutionContext ctx(*this, scope, *dev_ctx); - OpKernelMap& kernels = kernels_iter->second; // TODO(dzhwinter) : kernel fallback mechanism will be added when all the @@ -631,7 +628,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // Do selection // } - auto expected_kernel_key = this->GetExpectedKernelType(ctx); + auto expected_kernel_key = + this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -640,56 +638,34 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } - // do data transform - Scope& new_scope = scope.NewScope(); + // do data transform + std::vector transfered_inplace_vars; + auto* transfer_scope = + TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); - std::vector inplace_vars; - for (auto& var_name_item : this->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope.FindVar(var_name); - if (var && VarIsTensor(var)) { - auto* tensor_in = GetTensorFromVar(var); - if (tensor_in->IsInitialized()) { - auto kernel_type_for_var = this->GetKernelTypeForVar( - var_name_item.first, *tensor_in, expected_kernel_key); - if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) { - auto out_var_names = OutputVars(true); - if (std::find(out_var_names.begin(), out_var_names.end(), - var_name) != out_var_names.end()) { - inplace_vars.push_back(var_name); - } - VLOG(3) << "Transform Variable " << var_name << " from " - << kernel_type_for_var << " to " << expected_kernel_key; - auto* trans_var = new_scope.Var(var_name); - std::shared_ptr out(new Tensor); - DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in, - out.get()); - CopyVariableWithTensor(*var, *(out.get()), trans_var); - } - } - } - } + // exec_scope is the scope that the kernel is actually executed in. + const Scope& exec_scope = + (transfer_scope == nullptr ?
scope : *transfer_scope); + + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); } - auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); - kernel_iter->second->Compute( - ExecutionContext(*this, new_scope, *new_dev_ctx)); + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); - for (auto& var_name : inplace_vars) { - VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; - auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); - auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name)); - original_tensor->ShareDataWith(*transformed_tensor); + if (!transfered_inplace_vars.empty()) { + // there are inplace variables that have been transferred. + TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); } /*For profiling/benchmark only*/ if (FLAGS_benchmark) { - new_dev_ctx->Wait(); + dev_ctx->Wait(); } if (FLAGS_check_nan_inf) { for (auto& vname : OutputVars(true)) { - auto* var = new_scope.FindVar(vname); + auto* var = exec_scope.FindVar(vname); if (var == nullptr) continue; if (var->IsType()) { CheckTensorNANOrInf(vname, var->Get()); @@ -697,6 +673,64 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } +void OperatorWithKernel::TransferInplaceVarsBack( + const Scope& scope, const std::vector& inplace_vars, + const Scope& transfer_scope) const { + for (auto& var_name : inplace_vars) { + VLOG(3) << "share inplace var " + var_name + " back to its original scope"; + auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); + auto* transformed_tensor = + GetTensorFromVar(transfer_scope.FindVar(var_name)); + original_tensor->ShareDataWith(*transformed_tensor); + } +} + +Scope* OperatorWithKernel::TryTransferData( + const Scope& scope, const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars) const { + Scope* new_scope = nullptr; + for (auto& var_name_item : Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope.FindVar(var_name); + // Only tensors can be transferred to another device.
+ if (var == nullptr || !VarIsTensor(var)) { + continue; + } + + auto* tensor_in = GetTensorFromVar(var); + if (!tensor_in->IsInitialized()) { + continue; + } + + auto kernel_type_for_var = GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + + if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { + continue; + } + + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), var_name) != + out_var_names.end()) { + transfered_inplace_vars->emplace_back(var_name); + } + + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; + + if (new_scope == nullptr) { + new_scope = &scope.NewScope(); + } + + auto* trans_var = new_scope->Var(var_name); + Tensor out; + TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); + SetTensorToVariable(*var, out, trans_var); + } + } + + return new_scope; +} proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { @@ -713,10 +747,6 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &var->Get(); } else if (var->IsType()) { t = &(var->Get().value()); - } else if (var->IsType()) { - const LoDTensorArray& arr = var->Get(); - PADDLE_ENFORCE(arr.size() > 0); - t = &(arr[0]); } if (t != nullptr) { int tmp = static_cast(ToDataType(t->type())); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index b1d75d0d0ff3dccc67a1e833ccfe03a4cad8df39..01d750efbb8aaa35701f6caa7ec103ec21dd529e 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -347,9 +347,9 @@ class OpKernel : public OpKernelBase { class OperatorWithKernel : public OperatorBase { public: + using OpKernelFunc = std::function; using OpKernelMap = - std::unordered_map, - OpKernelType::Hash>; + std::unordered_map; OperatorWithKernel(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, const AttributeMap& attrs) @@ -384,6 +384,20 @@ class OperatorWithKernel : public OperatorBase { // same. proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; + + /** + * Transfer data from the scope to a transfer scope. If no data needs to + * be transferred, it returns nullptr. + * + * * transfered_inplace_vars is an output vector.
+ */ + Scope* TryTransferData( + const Scope& scope, const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars) const; + + void TransferInplaceVarsBack(const Scope& scope, + const std::vector& inplace_vars, + const Scope& exec_scope) const; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 751b10eeeed10828c08ada4173300c07f81c093e..b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -253,9 +253,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( t->set_lod(lod_tensors[j].lod()); } } - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } } ParallelExecutor::~ParallelExecutor() { diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 96114678a9992f2975c4173c7cc003114f04d8df..7f678f869aac4616c8bca440d0431f765da41dd6 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -23,9 +23,9 @@ namespace framework { template inline const T* Tensor::data() const { check_memory_size(); - PADDLE_ENFORCE(std::is_same::value || - holder_->type() == std::type_index(typeid(T)), - "Tensor holds the wrong type, it holds %s", + bool valid = std::is_same::value || + holder_->type() == std::type_index(typeid(T)); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", this->holder_->type().name()); return reinterpret_cast( @@ -37,9 +37,9 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } template inline T* Tensor::data() { check_memory_size(); - PADDLE_ENFORCE(std::is_same::value || - holder_->type() == std::type_index(typeid(T)), - "Tensor holds the wrong type, it holds %s", + bool valid = std::is_same::value || + holder_->type() == std::type_index(typeid(T)); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", this->holder_->type().name()); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index e5bc74755f46449296a153e8b330968e6d9f1e1d..f98011e896f4033ef210e0eb69f93ce7800a3cd6 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); + if (platform::is_same_place(src_place, dst_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + if (platform::is_same_place(ctx_place, src_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + } else if (platform::is_same_place(ctx_place, dst_place)) { + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); + } + } } #endif } @@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* 
dev_ctx; - if (platform::is_gpu_place(src.place())) { - dev_ctx = pool.Get(src.place()); - } else { + if (platform::is_gpu_place(dst_place)) { dev_ctx = pool.Get(dst_place); + } else { + dev_ctx = pool.Get(src.place()); } TensorCopy(src, dst_place, *dev_ctx, dst); } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index dca279b69382b80e055f661cefe84b81326704b5..4457382ade37a12f5f3613fc4113fbf1f6f91124 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -23,10 +23,25 @@ limitations under the License. */ namespace paddle { namespace framework { +// NOTE(zcd): Because TensorCopy is an async operation, when the src_place +// and dst_place are two different GPU, to ensure that the operation can +// be carried out correctly, there is a src_ctx wait operation in TensorCopy. +// If ctx_place and src_place are the same, src_ctx.Wait() is added +// after memory::Copy; if ctx_place and dst_place are the same, +// src_ctx.Wait() is added before memory::Copy. void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst); + +// NOTE(zcd): If the src.place() and dst_place are two different GPU, +// the copy operation is carried out on the dst_place's stream. This is +// very important, because TensorCopy is an async operator, and in most +// case, once this copy operator returns, dst is to be used in dst_place's +// stream, if this copy operation is carried out on the src_place's stream, +// when dst is used in dst_place's stream the copy operation may be +// not completed. void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); + void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst); diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 2b646d78f0b23ec3e065c891826856c2341d4ac1..429997c8b89fef7aa164e878095ab3b5c9998e5b 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -24,18 +24,24 @@ limitations under the License. 
*/ namespace paddle { namespace framework { + +template +bool IsType(const std::type_index& type_index) { + return type_index == std::type_index(typeid(T)); +} + inline proto::VarType::Type ToVarType(std::type_index type) { - if (type.hash_code() == typeid(LoDTensor).hash_code()) { + if (IsType(type)) { return proto::VarType_Type_LOD_TENSOR; - } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) { + } else if (IsType(type)) { return proto::VarType_Type_LOD_RANK_TABLE; - } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) { + } else if (IsType(type)) { return proto::VarType_Type_LOD_TENSOR_ARRAY; - } else if (type.hash_code() == typeid(SelectedRows).hash_code()) { + } else if (IsType(type)) { return proto::VarType_Type_SELECTED_ROWS; - } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) { + } else if (IsType(type)) { return proto::VarType_Type_READER; - } else if (type.hash_code() == typeid(ChannelHolder).hash_code()) { + } else if (IsType(type)) { return proto::VarType_Type_CHANNEL; } else { PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6fd73958bc480fe3983b9622c03ac77fba9ec8a7 --- /dev/null +++ b/paddle/fluid/inference/analysis/README.md @@ -0,0 +1,57 @@ +# Inference Analysis + +The `inference/analysis` module is used to analyze and optimize the inference program. +It borrows some philosophy from `LLVM/analysis`, +and makes the various optimization features pluggable so that they can co-exist in a pipeline. + +We borrowed some concepts from LLVM, such as + +- [Pass](./pass.h)es to implement optimizations that traverse the inference program, +- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program, +- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph. + +There are some other basic concepts here + +- [Node](./node.h), the node in a `DataFlowGraph`, + - `Function`, the Operator in Fluid, + - `Value`, the Variable in Fluid; +- [Argument](./argument.h), the object that serves as the input and output of all `Pass`es in the pipeline. + +## How it works + +The `inference/analysis` module arranges all the passes in a pipeline, and works in the following way: + +1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc, +2. Call the middle passes one by one, with the same `DataFlowGraph` passed across all the passes, +3. Transform a new ProgramDesc from the modified `DataFlowGraph`. + +New optimization features can be added as independent `Pass`es and controlled by gflags; +each pass generates unified debug information or a visualization for easier debugging. + +## Supported Passes + +### `FluidToDataFlowGraphPass` +Transform the fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes; +this should be the first pass of the pipeline. + +### `DataFlowGraphToFluidPass` +Generate a final `ProgramDesc` from a data flow graph; this should be the last pass of the pipeline. + +### `TensorRTSubgraphNodeMarkPass` +Mark the `Node`s that are supported by TensorRT; +this pass also generates a visualization file which can be used for debugging. + +### `TensorRTSubGraphPass` +Split out the sub-graphs that can be accelerated by TensorRT. + +### `DFG_GraphvizDrawPass` +This pass is just for debugging; it visualizes the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool.
+ +It can be used as a helper class that draws the modified graph after each pass. + +## Utilities + +There are some helper functions/classes for analysis. + +- [dot.h](./dot.h) gives an easy-to-use interface for generating `DOT` code, +- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms; it uses `iterator`s to make the algorithms easy to share across different passes. diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 5d85530969c5bec1c84d5f5b0d2626431a9e1c63..a4625f008c15300b88ef0bce71cd7d8aa473c9a8 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/analyzer.h" +#include #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" @@ -79,4 +80,4 @@ void Analyzer::Run(Argument* argument) { } // namespace analysis } // namespace inference -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index f290a3777d5be2ef64667d8c17ec59adddc3ef1b..e9e14fb1947da059c8d126d3da182ce446f6421e 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once + /* * This file contains Analyzer, an class that exposed as a library that analyze * and optimize diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 30c60661f3492034248e164a70a682bae3819d23..a4fefc83e0c551d52bec87299bcbc966e7a2dbf7 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -138,7 +138,7 @@ struct GraphTraits { // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. static std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph) { +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::unordered_set nodes(graph.begin(), graph.end()); std::unordered_set inputs; std::unordered_set outputs; diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index e74efd17b834db1d0314c8b7082f3e9c15d6eda3..29ca008123addf07959b965a4b54bf55b18c401d 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -13,6 +13,7 @@ // limitations under the License.
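To tie the analysis README above to the pass sources that follow, here is a hedged sketch of the pipeline it describes. The registration and driver calls are modeled on `pass_manager.h` as of this PR, but treat the exact names (`DfgPassManager`, `Register`, `Initialize`, `RunAll`) as assumptions rather than a confirmed API:

```cpp
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"

namespace analysis = paddle::inference::analysis;

void RunAnalysisPipeline(analysis::Argument* argument) {
  analysis::DfgPassManager manager;  // manager type assumed
  // First pass: ProgramDesc -> DataFlowGraph.
  manager.Register("fluid-to-data-flow-graph",
                   new analysis::FluidToDataFlowGraphPass);
  // ... middle passes (e.g. the TensorRT mark/fuse passes) go here ...
  // Last pass: DataFlowGraph -> ProgramDesc.
  manager.Register("data-flow-graph-to-fluid",
                   new analysis::DataFlowGraphToFluidPass);
  if (manager.Initialize(argument)) {
    manager.RunAll();
  }
}
```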
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/proto_desc.h" @@ -150,13 +151,14 @@ namespace { class DFG_DebuggerPass : public DFG_GraphvizDrawPass { public: using Config = DFG_GraphvizDrawPass::Config; - DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {} + explicit DFG_DebuggerPass(const Config& config) + : DFG_GraphvizDrawPass(config) {} std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } bool Finalize() override { return true; } }; -} +} // namespace Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index 1726e056ed37e2e5fbe2042851ca9bd188806bac..edc84b02ed20991e3e7c6c437d2b1fac169bae03 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -19,6 +19,7 @@ #pragma once +#include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/pass.h" diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h index b064782586f6243353eda67ac8db040509716b20..17445ab4407a159ca11345bc9a9226b3ad0044f0 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h @@ -46,7 +46,7 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass { const bool display_deleted_node; }; - DFG_GraphvizDrawPass(const Config &config) : config_(config) {} + explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {} bool Initialize(Argument *argument) override { return true; } void Run(DataFlowGraph *graph) override; diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 5d7eb43b7cbd7bc45b5f0c940bf80ad72348e1b9..e918622d74cfb11d83090555be2a768cc14e7742 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "analyzer.h" +#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" @@ -88,7 +88,8 @@ namespace { class DFG_DebuggerPass : public DFG_GraphvizDrawPass { public: using Config = DFG_GraphvizDrawPass::Config; - DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {} + explicit DFG_DebuggerPass(const Config &config) + : DFG_GraphvizDrawPass(config) {} std::string repr() const override { return "fluid-to-dfg-debuger-pass"; } bool Finalize() override { return true; } }; diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index fff1621d3f1bb31cfa04110d1f3cf5dbfe927331..f1064cd20f28092d80d3fd23a862da080b6cc2f3 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include +#include #include #include @@ -41,7 +42,7 @@ int AccuDims(Vec &&vec, int size) { return res; } -#define SET_TYPE(type__) dic_[typeid(type__).hash_code()] = #type__; +#define SET_TYPE(type__) dic_[std::type_index(typeid(type__))] = #type__; /* * Map typeid to representation. */ @@ -53,14 +54,14 @@ struct DataTypeNamer { template const std::string &repr() const { - auto x = typeid(T).hash_code(); + auto x = std::type_index(typeid(T)); PADDLE_ENFORCE(dic_.count(x), "unknown type for representation"); return dic_.at(x); } - const std::string &repr(size_t &hash) const { // NOLINT - PADDLE_ENFORCE(dic_.count(hash), "unknown type for representation"); - return dic_.at(hash); + const std::string &repr(const std::type_index &type) const { // NOLINT + PADDLE_ENFORCE(dic_.count(type), "unknown type for representation"); + return dic_.at(type); } private: @@ -71,9 +72,7 @@ struct DataTypeNamer { SET_TYPE(void *); } - std::unordered_map - dic_; + std::unordered_map dic_; }; #undef SET_TYPE diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc index d9d265d225bb77a3f5f83cbd0b8b1c670fb34a31..f2e918f3ff41d9db0c3ec38561015967bed26f4e 100644 --- a/paddle/fluid/inference/analysis/node.cc +++ b/paddle/fluid/inference/analysis/node.cc @@ -23,9 +23,9 @@ namespace analysis { template <> std::string &NodeAttr::As() { if (data_.empty()) { - type_hash_ = typeid(std::string).hash_code(); + type_index_ = std::type_index(typeid(std::string)); } - PADDLE_ENFORCE_EQ(type_hash_, typeid(std::string).hash_code()); + PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string))); return data_; } diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h index 8ecd1ae730e6ec6775f4a22fdc5dec0e8ca8e2d1..47e524bc5c4a6b1324d5f182053129311487522d 100644 --- a/paddle/fluid/inference/analysis/node.h +++ b/paddle/fluid/inference/analysis/node.h @@ -25,6 +25,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/inference/analysis/device.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/helper.h" @@ -57,12 +58,12 @@ struct NodeAttr { // init storage in the first usage. if (data_.empty()) { VLOG(4) << "resize data to " << sizeof(T); - type_hash_ = typeid(T).hash_code(); + type_index_ = std::type_index(typeid(T)); data_.resize(sizeof(T)); } - PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), + PADDLE_ENFORCE(framework::IsType(type_index_), "type not matched, origin is %s, want %s", - DataTypeNamer::Global().repr(type_hash_), + DataTypeNamer::Global().repr(type_index_), DataTypeNamer::Global().repr()); PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); return *reinterpret_cast(&data_[0]); @@ -70,7 +71,7 @@ struct NodeAttr { private: std::string data_; - size_t type_hash_{std::numeric_limits::max()}; + std::type_index type_index_{typeid(NodeAttr)}; }; /* diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index 6caba8f04237e014c5ddf1a3a077bcbadb0ddb71..dac1c509d728114bd24a2ea1150c407646026fd4 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/inference/analysis/pass_manager.h" +#include + #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/ut_helper.h" -#include - namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc index 5ad092a9ed201e5e6ab7770bcfd9ddf871779c12..f736e385c11add152dc9ab9485bf1de40f80b2f3 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" +#include + #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/node_attr_flags.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" namespace paddle { namespace inference { @@ -29,7 +31,7 @@ void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) { class DfgDebuggerPass : public DFG_GraphvizDrawPass { public: - DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) + explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) : DFG_GraphvizDrawPass(config) {} std::string repr() const override { diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h index 6cfac55d3b7b501e8ccc141cb7309f1428478672..c558a6ebbde371071c7330a14cc986bf764d1773 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h @@ -16,6 +16,10 @@ * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops * that supported by TensorRT engine. */ + +#pragma once + +#include #include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h" @@ -30,7 +34,8 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { public: using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller; - TensorRTSubgraphNodeMarkPass(const teller_t& teller) : teller_(teller) {} + explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller) + : teller_(teller) {} bool Initialize(Argument* argument) override { return true; } @@ -38,8 +43,10 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { // sub-graph into TensorRT. 
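The teller referenced above is the single extension point of the TensorRT passes: a callable that answers, per node, whether TensorRT can take it. A hedged example follows, using the `Node`/`Function` accessors as they appear in `node.h` of this PR (treat the accessor names as assumptions, and the supported-op whitelist as purely illustrative):

```cpp
#include <string>
#include <unordered_set>

#include "paddle/fluid/inference/analysis/node.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h"

namespace analysis = paddle::inference::analysis;

analysis::SubGraphSplitter::NodeInsideSubgraphTeller MakeTrtTeller() {
  return [](const analysis::Node* node) -> bool {
    // Only Function (operator) nodes can be offloaded to TensorRT.
    if (node->type() != analysis::Node::Type::kFunction) return false;
    const auto* func = static_cast<const analysis::Function*>(node);
    // Illustrative whitelist; real support depends on the TensorRT converters.
    static const std::unordered_set<std::string> supported{
        "mul", "conv2d", "relu", "pool2d"};
    return supported.count(func->func_type()) > 0;
  };
}
```

The same teller would then be handed to both `TensorRTSubgraphNodeMarkPass` and `TensorRTSubGraphPass`, so marking and fusing stay consistent.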
void Run(DataFlowGraph* graph) override; - std::string repr() const { return "tensorrt-sub-subgraph-mark"; } - std::string description() const { return "tensorrt sub-graph mark pass"; } + std::string repr() const override { return "tensorrt-sub-subgraph-mark"; } + std::string description() const override { + return "tensorrt sub-graph mark pass"; + } Pass* CreateGraphvizDebugerPass() const override; bool Finalize() override; diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h index 11e088069538414c79371b920cb8fa1509b24bb1..c6741a92095d33d261a4e1667c87a8ca02e51a9f 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h" @@ -30,7 +31,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { // Tell whether to transform a sub-graph into TensorRT. using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller; - TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); + explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); bool Initialize(Argument* argument) override { return true; } @@ -40,8 +41,8 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { bool Finalize() override { return true; } - std::string repr() const { return "tensorrt-sub-graph"; } - std::string description() const { return "tensorrt sub graph pass"; } + std::string repr() const override { return "tensorrt-sub-graph"; } + std::string description() const override { return "tensorrt sub graph pass"; } private: NodeInsideSubgraphTeller node_inside_subgraph_teller_; @@ -49,4 +50,4 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { } // namespace analysis } // namespace inference -} // paddle +} // namespace paddle diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 0c74f62de5c6f5d432ee928945db6dcf385ca209..bd98ed81899440a46415d30b6d74fec2dac4c155 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -20,6 +20,12 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" +DEFINE_bool(init_allocated_mem, false, + "It is a mistake that the values of the memory allocated by " + "BuddyAllocator are always zeroed in some op's implementation. 
" + "To find this error in time, we use init_allocated_mem to indicate " + "that initializing the allocated memory with a small value " + "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); namespace paddle { @@ -41,6 +47,9 @@ template <> void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); + if (FLAGS_init_allocated_mem) { + memset(p, 0xEF, size); + } VLOG(10) << " pointer=" << p; return p; } @@ -104,6 +113,9 @@ void* Alloc(platform::CUDAPlace place, size_t size) { LOG(WARNING) << "GPU memory used: " << Used(place); platform::SetDeviceId(cur_dev); } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } return ptr; } @@ -137,6 +149,9 @@ void* Alloc(platform::CUDAPinnedPlace place, LOG(WARNING) << "cudaMallocHost Cannot allocate " << size << " bytes in CUDAPinnedPlace"; } + if (FLAGS_init_allocated_mem) { + memset(ptr, 0xEF, size); + } return ptr; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9dc39ad0ddf8c5de3e1960a1171431e026de35ae..ab1d2143330fb8cbfd535758a83bc71de939c4e0 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -184,6 +184,7 @@ else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() +set(DISTRIBUTE_DEPS "") if(WITH_DISTRIBUTE) add_subdirectory(distributed) @@ -192,6 +193,18 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) else() set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + if(WITH_BRPC_RDMA) + find_library(IBVERBS_LIBRARY NAMES ibverbs) + ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY}) + + + find_library(RDMACM_LIBRARY NAMES rdmacm) + ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY}) + + set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm) + endif() endif() set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") @@ -205,7 +218,7 @@ if(WITH_DISTRIBUTE) # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL) + cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL) if(WITH_GRPC) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) else() @@ -297,6 +310,7 @@ foreach(src ${DETECTION_LIBRARY}) endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") +set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a2f5a2545701991263c1ef842e9275b1edbfd2ca --- /dev/null +++ b/paddle/fluid/operators/argsort_op.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/argsort_op.h" + +namespace paddle { +namespace operators { + +class ArgsortOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ArgsortOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ArgsortOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Indices"), + "Output(Indices) of ArgsortOp should not be null."); + + auto in_dims = ctx->GetInputDim("X"); + int axis = ctx->Attrs().Get("axis"); + + auto num_dims = in_dims.size(); + PADDLE_ENFORCE(axis < num_dims, + "Attr(axis) %d of ArgsortOp is out of bounds for Input(X)'s " + "rank %d.", + axis, num_dims); + PADDLE_ENFORCE(axis >= -num_dims, + "Attr(axis) %d of ArgsortOp must be not less than " + "-rank(Input(X)) (%d).", + axis, num_dims); + + ctx->SetOutputDim("Out", in_dims); + ctx->SetOutputDim("Indices", in_dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } +}; + +class ArgsortOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input of Argsort op."); + AddOutput("Out", + "(Tensor) The sorted tensor of Argsort op, with the same " + "shape as Input(X)."); + AddOutput("Indices", + "(Tensor) The indices of a tensor giving the sorted order, with " + "the same shape as Input(X)."); + AddComment(R"DOC( +Argsort operator + +Performs sorting on the input tensor along the given axis and outputs two +tensors, Output(Out) and Output(Indices). They reserve the same shape +with Input(X), and Output(Out) represents the sorted tensor while +Output(Indices) gives the sorted order along the given axis Attr(axis). + + )DOC"); + AddAttr("axis", + "(int, default -1) The axis along which to sort the tensor. " + "When axis < 0, the actual axis will be the |axis|'th " + "counting backwards. Default -1, the last dimension.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(argsort, + ops::ArgsortKernel, + ops::ArgsortKernel); diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7d5199aae7da4eed5afa6b8bd64c04a540b915d4 --- /dev/null +++ b/paddle/fluid/operators/argsort_op.cu @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using platform::PADDLE_CUDA_NUM_THREADS; + +const int kMaxRank = 9; // The max rank of a tensor allowed in Fluid + +__global__ void ComputeTargetIdx(const int64_t* in_dims, int dims_size, + int axis, int64_t n, int64_t* trg_idx, + int64_t* med_ids) { + int64_t index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + int64_t shape_out_axis[kMaxRank - 1] = {0}; + int64_t dims_out_axis[kMaxRank - 1] = {0}; + int64_t tmp = index; + int64_t pos_in_axis = 0; + int64_t i = dims_size - 2; + int64_t dim_axis = 0; + for (int64_t j = dims_size - 1; j >= 0; --j) { + int64_t dim = in_dims[j]; + if (j != axis) { + shape_out_axis[i] = tmp % dim; + dims_out_axis[i] = dim; + i--; + } else { + dim_axis = dim; + pos_in_axis = tmp % dim_axis; + } + tmp /= dim; + } + int64_t group = (dims_size > 1) ? shape_out_axis[0] : 0; + for (int64_t j = 0; j < dims_size - 2; ++j) { + group = group * dims_out_axis[j + 1] + shape_out_axis[j + 1]; + } + + int64_t traget_idx = group * dim_axis + pos_in_axis; + trg_idx[index] = traget_idx; + med_ids[traget_idx] = pos_in_axis; + } +} + +template +__global__ void PermuteInData(const T* in, const int64_t* trg_idx, int64_t n, + T* med_out) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + med_out[trg_idx[index]] = in[index]; + } +} + +template +__global__ void Sort(int64_t axis_dim, int64_t groups, T* med_out, + int64_t* med_ids) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < groups) { + thrust::sort_by_key(thrust::device, med_out + index * axis_dim, + med_out + axis_dim * (1 + index), + med_ids + index * axis_dim); + } +} + +template +__global__ void PermuteMediateData(const T* med_out, const int64_t* med_ids, + const int64_t* trg_idx, int64_t n, T* out, + int64_t* indices) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + out[index] = med_out[trg_idx[index]]; + indices[index] = med_ids[trg_idx[index]]; + } +} + +template +class ArgsortOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + int axis = ctx.Attr("axis"); + + auto in_dims = input->dims(); + axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; + + const T* in_data = input->data(); + T* out_data = output->mutable_data(ctx.GetPlace()); + int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); + + int64_t numel = input->numel(); + int64_t groups = numel / in_dims[axis]; + + std::vector in_dims_vec = vectorize(in_dims); + thrust::device_vector in_dims_dev(in_dims_vec.begin(), + in_dims_vec.end()); + int64_t* in_dims_data = thrust::raw_pointer_cast(in_dims_dev.data()); + // Mediate tensor for sorting data and indices + Tensor mediate_output, mediate_indices; + T* med_out_data = + mediate_output.mutable_data(input->dims(), ctx.GetPlace()); + int64_t* med_ids_data = + mediate_indices.mutable_data(in_dims, ctx.GetPlace()); + // Target index of each element along the given axis in the mediate tensors + Tensor trg_idx_t; + int64_t* trg_idx = trg_idx_t.mutable_data(in_dims, ctx.GetPlace()); + + auto stream = ctx.cuda_device_context().stream(); + const int num_threads = PADDLE_CUDA_NUM_THREADS; + + ComputeTargetIdx<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>( + in_dims_data, in_dims.size(), axis, numel, trg_idx, med_ids_data); + + PermuteInData<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>( + in_data, trg_idx, numel, med_out_data); + + Sort<<<(groups - 1) / num_threads + 1, num_threads, 0, stream>>>( + in_dims[axis], groups, med_out_data, med_ids_data); + + PermuteMediateData<<<(numel - 1) / num_threads + 1, num_threads, 0, + stream>>>(med_out_data, med_ids_data, trg_idx, numel, + out_data, ids_data); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(argsort, paddle::operators::ArgsortOpCUDAKernel, + paddle::operators::ArgsortOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7e9112cfb7cbe5f783b04729fb4dff3676c922bc --- /dev/null +++ b/paddle/fluid/operators/argsort_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class ArgsortKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + int axis = ctx.Attr("axis"); + + auto in_dims = input->dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + + const T* in_data = input->data(); + T* out_data = output->mutable_data(ctx.GetPlace()); + int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); + + int64_t groups = input->numel() / in_dims[axis]; + int64_t stride = (axis == in_dims.size() - 1) + ? 
1 + : framework::product(framework::slice_ddim( + in_dims, axis + 1, in_dims.size())); + + for (int64_t i = 0; i < groups; ++i) { + int64_t idx = i; + std::vector shape_vec(in_dims.size(), 0); + for (int64_t dim = in_dims.size() - 1; dim >= 0; --dim) { + if (dim != axis) { + shape_vec[dim] = idx % in_dims[dim]; + idx /= in_dims[dim]; + } + } + + int64_t start_index = shape_vec[0]; + for (int64_t dim = 0; dim < in_dims.size() - 1; ++dim) { + start_index = start_index * in_dims[dim + 1] + shape_vec[dim + 1]; + } + + std::vector org_index_vec(in_dims[axis], start_index); + for (int64_t j = 1; j < in_dims[axis]; ++j) { + org_index_vec[j] += j * stride; + } + + std::sort(org_index_vec.begin(), org_index_vec.end(), + [in_data](const int64_t v1, const int64_t v2) { + return in_data[v1] < in_data[v2]; + }); + + for (size_t j = 0; j < org_index_vec.size(); ++j) { + int64_t index = start_index + j * stride; + out_data[index] = in_data[org_index_vec[j]]; + ids_data[index] = (org_index_vec[j] - start_index) / stride; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 6ecb43c49c30f9da2a273d506f7b85c0a4f5fa2c..9ab2179b5fe689762704039c5f67dd080e530aa5 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -115,9 +115,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor - auto src_memory = - memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, - to_void_cast(x_data)); + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); + + auto src_memory = memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data)); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; @@ -251,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { using bn_bwd_types = bn_type_traits; // create mkldnn memory from input diff_y tensor - auto user_diff_dst_memory = - memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, - mkldnn_engine}, - to_void_cast(diff_y_data)); + + mkldnn::memory::format dst_format = + platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); + + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + to_void_cast(diff_y_data)); // create mkldnn memory from input x tensor - auto src_memory = - memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, - to_void_cast(x_data)); + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); + + auto src_memory = memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data)); // for diff_dst, try to use same format as dst in forward pass auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc index 5984f80d04bdeb232f8e24264ae979725af24ef4..8cc1d94260baccfe28d213b7e021956819e2e79e 100644 --- a/paddle/fluid/operators/conditional_block_op.cc +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
index 5984f80d04bdeb232f8e24264ae979725af24ef4..8cc1d94260baccfe28d213b7e021956819e2e79e 100644
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <algorithm>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
 
 namespace paddle {
 namespace operators {
@@ -47,7 +48,7 @@ class ConditionalOp : public framework::OperatorBase {
     if (!(ips.size() == 1UL && ips[0]->IsInitialized())) {
       PADDLE_THROW("should have one initialized input as condition");
     }
-    if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() &&  // NOLINT
+    if (!(framework::IsType<bool>(ips[0]->type()) &&  // NOLINT
           ips[0]->numel() == 1)) {
       PADDLE_THROW(
           "condition input's data type should be bool, "
diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h
index b9e385994efcea0388756e8bd780ebfc719ed08d..6f4a15caa5542a45cd8e26a72b055ca8948069d0 100644
--- a/paddle/fluid/operators/detail/macros.h
+++ b/paddle/fluid/operators/detail/macros.h
@@ -14,14 +14,22 @@
 
 #pragma once
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+
 #ifdef PADDLE_WITH_GRPC
+
 #include "paddle/fluid/operators/distributed/grpc_client.h"
 #include "paddle/fluid/operators/distributed/grpc_server.h"
-#define RPCSERVER_T distributed::AsyncGRPCServer
-#define RPCCLIENT_T distributed::GRPCClient
-#else
+#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
+#define RPCCLIENT_T paddle::operators::distributed::GRPCClient
+
+#else  // PADDLE_WITH_GRPC
+
 #include "paddle/fluid/operators/distributed/brpc_client.h"
 #include "paddle/fluid/operators/distributed/brpc_server.h"
-#define RPCSERVER_T distributed::AsyncBRPCServer
-#define RPCCLIENT_T distributed::BRPCClient
-#endif
+#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
+#define RPCCLIENT_T paddle::operators::distributed::BRPCClient
+
+#endif  // PADDLE_WITH_GRPC
+
+#endif  // PADDLE_WITH_DISTRIBUTE
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 20d960f9fee1eae42b2241fb96c163e15db5e24d..6d296ff7bf14de9175dc589dfa8b46c534127ca1 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -22,6 +22,8 @@ iou_similarity_op.cu)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
 detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
+detection_library(anchor_generator_op SRCS anchor_generator_op.cc
+anchor_generator_op.cu)
 detection_library(target_assign_op SRCS target_assign_op.cc
 target_assign_op.cu)
 detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
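Aside: the macros.h change rewrites `RPCSERVER_T`/`RPCCLIENT_T` to fully qualified names. Because a macro expands at its use site, a relative name like `distributed::AsyncGRPCServer` only resolves when the caller happens to sit inside a namespace where `distributed` is visible. A minimal standalone repro of that pitfall (all names here are invented for illustration):

```cpp
#include <iostream>

namespace app {
namespace net {
struct Client {
  static void Hello() { std::cout << "hello\n"; }
};
}  // namespace net
}  // namespace app

// Relative expansion: only compiles where `net` happens to be visible.
#define CLIENT_T_RELATIVE net::Client
// Fully qualified expansion: compiles at any use site.
#define CLIENT_T_QUALIFIED ::app::net::Client

namespace app {
void FromInsideApp() { CLIENT_T_RELATIVE::Hello(); }  // ok: app::net in scope
}  // namespace app

void FromGlobalScope() {
  // CLIENT_T_RELATIVE::Hello();  // would not compile: no global `net`
  CLIENT_T_QUALIFIED::Hello();    // ok everywhere
}

int main() {
  app::FromInsideApp();
  FromGlobalScope();
}
```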
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c0155a0a977846b1300d93b4c3fef0e71fc1d26
--- /dev/null
+++ b/paddle/fluid/operators/detection/anchor_generator_op.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/anchor_generator_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AnchorGeneratorOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of AnchorGeneratorOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Anchors"),
+                   "Output(Anchors) of AnchorGeneratorOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Variances"),
+        "Output(Variances) of AnchorGeneratorOp should not be null.");
+
+    auto input_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
+
+    auto anchor_sizes = ctx->Attrs().Get<std::vector<float>>("anchor_sizes");
+    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
+    auto stride = ctx->Attrs().Get<std::vector<float>>("stride");
+    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
+
+    size_t num_anchors = aspect_ratios.size() * anchor_sizes.size();
+
+    std::vector<int64_t> dim_vec(4);
+    dim_vec[0] = input_dims[2];
+    dim_vec[1] = input_dims[3];
+    dim_vec[2] = num_anchors;
+    dim_vec[3] = 4;
+    ctx->SetOutputDim("Anchors", framework::make_ddim(dim_vec));
+    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input",
+             "(Tensor, default Tensor<float>), "
+             "the input feature is a tensor with a rank of 4. "
+             "The layout is NCHW.");
+    AddOutput("Anchors",
+              "(Tensor, default Tensor<float>), the output is a "
+              "tensor with a rank of 4. The layout is [H, W, num_anchors, 4]. "
+              "H is the height of input, W is the width of input, num_anchors "
+              "is the box count of each position. "
+              "Each anchor is in (xmin, ymin, xmax, ymax) format.");
+    AddOutput("Variances",
+              "(Tensor, default Tensor<float>), the expanded variances for "
+              "normalizing bbox regression targets. The layout is [H, W, "
+              "num_anchors, 4]. "
+              "H is the height of input, W is the width of input, num_anchors "
+              "is the box count of each position. "
+              "Each variance is in (xcenter, ycenter, w, h) format.");
+
+    AddAttr<std::vector<float>>(
+        "anchor_sizes",
+        "(vector<float>) List of Region Proposal Network(RPN) anchor sizes "
+        "given in absolute pixels, e.g. (64, 128, 256, 512). "
+        "For instance, an anchor size of 64 means the area of this anchor "
+        "equals 64**2.")
+        .AddCustomChecker([](const std::vector<float>& anchor_sizes) {
+          PADDLE_ENFORCE_GT(anchor_sizes.size(), 0,
+                            "Size of anchor_sizes must be at least 1.");
+          for (size_t i = 0; i < anchor_sizes.size(); ++i) {
+            PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0,
+                              "anchor_sizes[%d] must be positive.", i);
+          }
+        });
+    AddAttr<std::vector<float>>(
+        "aspect_ratios",
+        "(vector<float>) List of Region Proposal Network(RPN) anchor aspect "
+        "ratios, e.g. (0.5, 1, 2). "
+        "For instance, an aspect ratio of 0.5 means the height / width of "
+        "this anchor equals 0.5.");
+ "For instacne, the aspect ratio of 0.5 means the height / width of " + "this anchor equals 0.5."); + + AddAttr>("variances", + "(vector) List of variances to be used " + "in box regression deltas") + .AddCustomChecker([](const std::vector& variances) { + PADDLE_ENFORCE_EQ(variances.size(), 4, + "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + }); + + AddAttr>("stride", + "Anchors stride across width and height, " + "with a default of (16, 16)") + .SetDefault(std::vector(2, 16.0)) + .AddCustomChecker([](const std::vector& stride) { + PADDLE_ENFORCE_EQ( + stride.size(), 2, + "Must and only provide 2 stride for width and height."); + for (size_t i = 0; i < stride.size(); ++i) { + PADDLE_ENFORCE_GT(stride[i], 0.0, + "stride[%d] should be larger than 0.", i); + } + }); + + AddAttr("offset", + "(float) " + "Anchor center offset, with a default of 0.5") + .SetDefault(0.5); + AddComment(R"DOC( +AnchorGenerator operator +Generates anchors for Faster RCNN, FPN etc. algorithm. +Each position of the input produce N anchors, N = + size(anchor_sizes) * size(aspect_ratios). + +Please get more information from the following papers: +https://arxiv.org/abs/1506.01497. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(anchor_generator, ops::AnchorGeneratorOp, + ops::AnchorGeneratorOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(anchor_generator, ops::AnchorGeneratorOpKernel, + ops::AnchorGeneratorOpKernel); diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..3cc9bbeee1eeed17142a6b1bd23b45aff9cf745f --- /dev/null +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
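Aside: the box arithmetic that both the CUDA and CPU kernels below implement is compact but easy to misread. This standalone sketch (illustrative names, not Paddle API) computes a single anchor from one feature-map cell, one anchor size and one aspect ratio, exactly mirroring the `base_w`/`base_h`/`scale_w`/`scale_h` steps in the kernels:

```cpp
#include <cmath>
#include <cstdio>

struct Box {
  float xmin, ymin, xmax, ymax;
};

// One anchor for cell (w_idx, h_idx): build a base box of the requested
// aspect ratio from the stride area, then scale it so its side roughly
// matches anchor_size.
Box MakeAnchor(int w_idx, int h_idx, float stride_w, float stride_h,
               float anchor_size, float aspect_ratio, float offset) {
  float x_ctr = w_idx * stride_w + offset * (stride_w - 1);
  float y_ctr = h_idx * stride_h + offset * (stride_h - 1);
  float area = stride_w * stride_h;
  float base_w = std::round(std::sqrt(area / aspect_ratio));
  float base_h = std::round(base_w * aspect_ratio);
  float w = (anchor_size / stride_w) * base_w;
  float h = (anchor_size / stride_h) * base_h;
  return {x_ctr - 0.5f * (w - 1), y_ctr - 0.5f * (h - 1),
          x_ctr + 0.5f * (w - 1), y_ctr + 0.5f * (h - 1)};
}

int main() {
  Box b = MakeAnchor(0, 0, 16, 16, 64, 1.0f, 0.5f);
  // Prints -24.0 -24.0 39.0 39.0: a 64x64 box centered on the first cell.
  std::printf("%.1f %.1f %.1f %.1f\n", b.xmin, b.ymin, b.xmax, b.ymax);
}
```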
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3cc9bbeee1eeed17142a6b1bd23b45aff9cf745f
--- /dev/null
+++ b/paddle/fluid/operators/detection/anchor_generator_op.cu
@@ -0,0 +1,132 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/anchor_generator_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
+                           const T* anchor_sizes, const int as_num,
+                           const T* stride, const int sd_num, const int height,
+                           const int width, const T offset) {
+  int num_anchors = as_num * ar_num;
+  int box_num = height * width * num_anchors;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
+       i += blockDim.x * gridDim.x) {
+    int h_idx = i / (num_anchors * width);
+    int w_idx = (i / num_anchors) % width;
+    T stride_width = stride[0];
+    T stride_height = stride[1];
+    T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1);
+    T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1);
+    T area, area_ratios;
+    T base_w, base_h;
+    T scale_w, scale_h;
+    T anchor_width, anchor_height;
+    int anch_idx = i % num_anchors;
+    int ar_idx = anch_idx / as_num;
+    int as_idx = anch_idx % as_num;
+    T aspect_ratio = aspect_ratios[ar_idx];
+    T anchor_size = anchor_sizes[as_idx];
+    area = stride_width * stride_height;
+    area_ratios = area / aspect_ratio;
+    base_w = round(sqrt(area_ratios));
+    base_h = round(base_w * aspect_ratio);
+    scale_w = anchor_size / stride_width;
+    scale_h = anchor_size / stride_height;
+    anchor_width = scale_w * base_w;
+    anchor_height = scale_h * base_h;
+
+    T xmin = (x_ctr - 0.5 * (anchor_width - 1));
+    T ymin = (y_ctr - 0.5 * (anchor_height - 1));
+    T xmax = (x_ctr + 0.5 * (anchor_width - 1));
+    T ymax = (y_ctr + 0.5 * (anchor_height - 1));
+    out[i * 4] = xmin;
+    out[i * 4 + 1] = ymin;
+    out[i * 4 + 2] = xmax;
+    out[i * 4 + 3] = ymax;
+  }
+}
+
+template <typename T>
+__global__ void SetVariance(T* out, const T* var, const int vnum,
+                            const int num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = var[i % vnum];
+  }
+}
+
+template <typename T>
+class AnchorGeneratorOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* anchors = ctx.Output<paddle::framework::Tensor>("Anchors");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto anchor_sizes = ctx.Attr<std::vector<float>>("anchor_sizes");
+    auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
+    auto stride = ctx.Attr<std::vector<float>>("stride");
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto width = input->dims()[3];
+    auto height = input->dims()[2];
+
+    int num_anchors = aspect_ratios.size() * anchor_sizes.size();
+
+    int box_num = width * height * num_anchors;
+
+    int block = 512;
+    int grid = (box_num + block - 1) / block;
+
+    auto stream =
+        ctx.template device_context<platform::CUDADeviceContext>().stream();
+
+    anchors->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    framework::Tensor ar;
+    framework::TensorFromVector(aspect_ratios, ctx.device_context(), &ar);
+
+    framework::Tensor as;
+    framework::TensorFromVector(anchor_sizes, ctx.device_context(), &as);
+
+    framework::Tensor sd;
+    framework::TensorFromVector(stride, ctx.device_context(), &sd);
+
+    GenAnchors<T><<<grid, block, 0, stream>>>(
+        anchors->data<T>(), ar.data<T>(), aspect_ratios.size(), as.data<T>(),
+        anchor_sizes.size(), sd.data<T>(), stride.size(), height, width,
+        offset);
+
+    framework::Tensor v;
+    framework::TensorFromVector(variances, ctx.device_context(), &v);
+    grid = (box_num * 4 + block - 1) / block;
+    SetVariance<T><<<grid, block, 0, stream>>>(vars->data<T>(), v.data<T>(),
+                                               variances.size(), box_num * 4);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(anchor_generator,
+                        ops::AnchorGeneratorOpCUDAKernel<float>,
+                        ops::AnchorGeneratorOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0e499d76a19ba5f6b91ba4c8797684fb53c7caa
--- /dev/null
+++ b/paddle/fluid/operators/detection/anchor_generator_op.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class AnchorGeneratorOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* anchors = ctx.Output<paddle::framework::Tensor>("Anchors");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto anchor_sizes = ctx.Attr<std::vector<float>>("anchor_sizes");
+    auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
+    auto stride = ctx.Attr<std::vector<float>>("stride");
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+
+    T stride_width, stride_height;
+    stride_width = stride[0];
+    stride_height = stride[1];
+
+    int num_anchors = aspect_ratios.size() * anchor_sizes.size();
+
+    anchors->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    auto e_anchors = framework::EigenTensor<T, 4>::From(*anchors);
+    for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
+      for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
+        T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1);
+        T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1);
+        T area, area_ratios;
+        T base_w, base_h;
+        T scale_w, scale_h;
+        T anchor_width, anchor_height;
+        int idx = 0;
+        for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+          auto ar = aspect_ratios[r];
+          for (size_t s = 0; s < anchor_sizes.size(); ++s) {
+            auto anchor_size = anchor_sizes[s];
+            area = stride_width * stride_height;
+            area_ratios = area / ar;
+            base_w = round(sqrt(area_ratios));
+            base_h = round(base_w * ar);
+            scale_w = anchor_size / stride_width;
+            scale_h = anchor_size / stride_height;
+            anchor_width = scale_w * base_w;
+            anchor_height = scale_h * base_h;
+            e_anchors(h_idx, w_idx, idx, 0) =
+                (x_ctr - 0.5 * (anchor_width - 1));
+            e_anchors(h_idx, w_idx, idx, 1) =
+                (y_ctr - 0.5 * (anchor_height - 1));
+            e_anchors(h_idx, w_idx, idx, 2) =
+                (x_ctr + 0.5 * (anchor_width - 1));
+            e_anchors(h_idx, w_idx, idx, 3) =
+                (y_ctr + 0.5 * (anchor_height - 1));
+            idx++;
+          }
+        }
+      }
+    }
+
+    framework::Tensor var_t;
+    var_t.mutable_data<T>(
+        framework::make_ddim({1, static_cast<int>(variances.size())}),
+        ctx.GetPlace());
+    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
+    for (size_t i = 0; i < variances.size(); ++i) {
+      var_et(0, i) = variances[i];
+    }
+
+    int anchor_num = feature_height * feature_width * num_anchors;
+    auto var_dim = vars->dims();
+    vars->Resize({anchor_num, static_cast<int>(variances.size())});
+
+    auto e_vars = framework::EigenMatrix<T>::From(*vars);
+    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(anchor_num, 1));
+
+    vars->Resize(var_dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 8228a8c5a3eae73fe82551c8aad55290b0d54ef0..4a09f3870d64d8e14b2db41ff3ea7c2f9e67b558 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -35,10 +35,20 @@ void GRPCClient::InitEventLoop() {
   client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }
 
-void GRPCClient::SendComplete() {
+void GRPCClient::SendBeginPass() {
   for (auto& it : channels_) {
-    this->AsyncSendComplete(it.first);
+    VLOG(3) << "send begin pass to: " << it.first;
+    this->AsyncSendBeginPass(it.first);
   }
+  this->Wait();
+}
+
+void GRPCClient::SendEndPass() {
+  for (auto& it : channels_) {
+    VLOG(3) << "send end pass to " << it.first;
+    this->AsyncSendEndPass(it.first);
+  }
+  this->Wait();
 }
 
 GRPCClient::~GRPCClient() {
@@ -226,19 +236,32 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   req_count_++;
 }
 
-void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendBeginPass(const std::string& ep, int64_t time_out) {
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
   s->Prepare(time_out);
 
   sendrecv::VariableMessage req;
-  req.set_varname(COMPLETE_MESSAGE);
+  req.set_varname(BEGIN_PASS_MESSAGE);
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
 }
 
+void GRPCClient::AsyncSendEndPass(const std::string& ep, int64_t time_out) {
+  const auto ch = GetChannel(ep);
+
+  FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
+  s->Prepare(time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(END_PASS_MESSAGE);
+  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+}
+
 void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
                                        const std::string& dir,
                                        int64_t time_out) {
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
index 7a08f2d3a4a28a4323723e6b887c50588eed2bce..5dae20155edcf9edd746a5d9a9bbe0ccd789f431 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -77,11 +77,12 @@ class BaseProcessor {
     context_.reset(new grpc::ClientContext());
     var_h_ = var_info;
     context_->set_wait_for_ready(true);
-
-    std::chrono::system_clock::time_point deadline =
-        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
-
-    context_->set_deadline(deadline);
+    if (time_out) {
+      std::chrono::system_clock::time_point deadline =
+          std::chrono::system_clock::now() +
+          std::chrono::milliseconds(time_out);
+      context_->set_deadline(deadline);
+    }
   }
 
   virtual void Prepare(int64_t time_out) {
@@ -214,9 +215,17 @@ class GRPCClient : public RPCClient {
   void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
                              int64_t time_out = FLAGS_rpc_deadline) override;
 
+  void AsyncSendBeginPass(const std::string& ep,
+                          int64_t time_out = FLAGS_rpc_deadline) override;
+
+  void AsyncSendEndPass(const std::string& ep,
+                        int64_t time_out = FLAGS_rpc_deadline) override;
+
   void Wait() override;
 
-  void SendComplete() override;
+  void SendBeginPass() override;
+
+  void SendEndPass() override;
 
 protected:
   void InitImpl() override;
@@ -227,9 +236,6 @@ class GRPCClient : public RPCClient {
 
   void Proceed();
 
-  void AsyncSendComplete(const std::string& ep,
-                         int64_t time_out = FLAGS_rpc_deadline);
-
   std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
 
 private:
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 90742a201ad46447d6fbbe2137aa40fabc2f9983..271306d5d20f1b849a81a9bfa6436f2faf261204 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -37,11 +37,14 @@ constexpr char kRequestSend[] = "RequestSend";
 constexpr char kRequestGet[] = "RequestGet";
 constexpr char kRequestPrefetch[] = "RequestPrefetch";
 constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
+constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
 
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
 #define COMPLETE_MESSAGE "COMPLETE@RECV"
+#define BEGIN_PASS_MESSAGE "BEGIN_PASS@RECV"
+#define END_PASS_MESSAGE "END_PASS@RECV"
 
 #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
 #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 163154c678f65b08981041d647b11f4b2b5860ba..5e6bff20f5f8c06e1497c697e3aabf7b9cb94ad6 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -55,14 +55,14 @@ bool RequestSendHandler::Handle(const std::string& varname,
   if (varname == BATCH_BARRIER_MESSAGE) {
     VLOG(3) << "sync: recv batch barrier message";
     rpc_server_->IncreaseBatchBarrier(kRequestSend);
-  } else if (varname == COMPLETE_MESSAGE) {
-    VLOG(3) << "sync: recv complete message";
-    rpc_server_->DecreaseClientNum();
+  } else if (varname == BEGIN_PASS_MESSAGE) {
+    VLOG(3) << "sync: recv begin pass message";
+    rpc_server_->WaitCond(kRequestSend);
+    rpc_server_->BeginPass();
   } else {
     VLOG(3) << "sync: received var_name: " << varname;
-    if (sync_mode_) {
-      rpc_server_->WaitCond(kRequestSend);
-    }
+    rpc_server_->WaitCond(kRequestSend);
+    VLOG(3) << "sync: processing received var: " << varname;
 
     if (invar == nullptr) {
       LOG(ERROR) << "sync: Can not find server side var: " << varname;
@@ -91,21 +91,21 @@ bool RequestGetHandler::Handle(const std::string& varname,
                                framework::Variable** outvar,
                                const std::string& out_var_name) {
   VLOG(4) << "RequestGetHandler:" << varname;
-
-  if (varname != FETCH_BARRIER_MESSAGE) {
-    if (sync_mode_) {
+  if (sync_mode_) {
+    if (varname == FETCH_BARRIER_MESSAGE) {
+      VLOG(3) << "sync: recv fetch barrier message";
+      rpc_server_->IncreaseBatchBarrier(kRequestGet);
+    } else if (varname == END_PASS_MESSAGE) {
+      rpc_server_->EndPass();
+    } else {
       rpc_server_->WaitCond(kRequestGet);
+      *outvar = scope_->FindVar(varname);
+    }
+  } else {
+    if (varname != FETCH_BARRIER_MESSAGE && varname != END_PASS_MESSAGE) {
+      *outvar = scope_->FindVar(varname);
     }
-    *outvar = scope_->FindVar(varname);
-    return true;
-  }
-
-  // FETCH_BARRIER_MESSAGE
-  if (sync_mode_) {
-    VLOG(3) << "sync: recv fetch barrier message";
-    rpc_server_->IncreaseBatchBarrier(kRequestGet);
   }
-
   return true;
 }
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index 37783b78ecc5c58aab3e358066bd7f2fba861799..6479d3a97bafba37b74a1d1c04852a6e60e01be8 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -60,10 +60,17 @@ class RPCClient {
                                      const std::string& dir,
                                      int64_t time_out = FLAGS_rpc_deadline) = 0;
 
-  // SendComplete tells all the server that current trainer have no more data
-  // to train, so that the pserver can reduce it's barrier count, and continue
-  // to train with other trainers.
-  virtual void SendComplete() = 0;
+  virtual void AsyncSendBeginPass(const std::string& ep,
+                                  int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual void AsyncSendEndPass(const std::string& ep,
+                                int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  // SendBeginPass/SendEndPass tell all the pservers that a pass starts/ends,
+  // so that the pserver can increase/reduce its barrier count and continue
+  // to train with other trainers.
+  virtual void SendBeginPass() = 0;
+  virtual void SendEndPass() = 0;
 
   virtual void Wait() = 0;
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index c0520e248d49f4f390af9075fc6f87ec4bd74c39..d49ee34eeaf4e80f6fd4f8cdc548cc2b938d0f2a 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -44,7 +44,8 @@ void RPCServer::SavePort() const {
 void RPCServer::WaitBarrier(const std::string& rpc_name) {
   std::unique_lock<std::mutex> lock(this->mutex_);
   barrier_cond_.wait(lock, [this, &rpc_name] {
-    return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
+    return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) ||
+            exit_flag_.load());
   });
 
   VLOG(3) << "batch_barrier_: " << rpc_name << " "
@@ -63,10 +64,25 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
   }
 }
 
-void RPCServer::DecreaseClientNum() {
+void RPCServer::BeginPass() {
+  VLOG(4) << "RPCServer begin increase pass barrier";
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    client_num_++;
+    VLOG(4) << "increase client_num to: " << client_num_;
+  }
+  barrier_cond_.notify_all();
+}
+
+void RPCServer::EndPass() {
+  VLOG(4) << "RPCServer begin decrease pass barrier";
   {
     std::unique_lock<std::mutex> lock(mutex_);
     client_num_--;
+    VLOG(4) << "decrease client_num to: " << client_num_;
+    if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
+      barrier_counter_[kRequestGet]--;
+    }
   }
   barrier_cond_.notify_all();
 }
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index cf25e78435bb470b25a46db647ca818571cc83a5..833991c8aa6e7cfd10f2aa52f9218be7ff8ccebf 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -43,6 +43,9 @@ class RPCServer {
   bool IsExit() { return exit_flag_.load(); }
 
   int GetSelectedPort() const { return selected_port_; }
+
+  int GetClientNum() const;
+
   void SavePort() const;
 
   // RegisterRPC, register the rpc method name to a handler
@@ -60,7 +63,10 @@ class RPCServer {
   void SetCond(const std::string& rpc_name);
   void WaitCond(const std::string& rpc_name);
   void IncreaseBatchBarrier(const std::string rpc_name);
-  void DecreaseClientNum();
+
+  void BeginPass();
+  void EndPass();
+
   void ResetBarrierCounter();
 
 protected:
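Aside: the replacement of the one-shot "complete" message with a begin/end pass pair means the pserver's barrier now tracks how many trainers are currently inside a pass. This toy, single-process model (illustrative only; the real logic lives in `RPCServer`) shows why the `client_num_ != 0` guard added to `WaitBarrier` matters: without it, a barrier with zero registered trainers would release immediately.

```cpp
#include <condition_variable>
#include <iostream>
#include <mutex>

// BeginPass registers a trainer, EndPass deregisters it, and WaitBarrier
// releases only when every registered trainer has arrived.
class PassBarrier {
 public:
  void BeginPass() {
    std::lock_guard<std::mutex> lock(mu_);
    ++client_num_;
  }
  void EndPass() {
    std::lock_guard<std::mutex> lock(mu_);
    --client_num_;
    cv_.notify_all();
  }
  void Arrive() {  // a trainer reaches the barrier
    std::lock_guard<std::mutex> lock(mu_);
    ++counter_;
    cv_.notify_all();
  }
  void WaitBarrier() {
    std::unique_lock<std::mutex> lock(mu_);
    // client_num_ != 0 mirrors the fixed wait condition above.
    cv_.wait(lock,
             [this] { return counter_ == client_num_ && client_num_ != 0; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int client_num_ = 0;
  int counter_ = 0;
};

int main() {
  PassBarrier b;
  b.BeginPass();  // one trainer joins the pass
  b.Arrive();     // it reaches the barrier
  b.WaitBarrier();
  std::cout << "barrier released\n";
}
```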
diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/fc_mkldnn_op.cc
index 847b7b0c12e1679501dbe83d578b23ca2aef3e9e..99fa659a351249a4a93f71700e1c646465861aba 100644
--- a/paddle/fluid/operators/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/fc_mkldnn_op.cc
@@ -115,6 +115,7 @@ class MKLDNNMemory {
 
 template <typename T>
 class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc
index a9d47c017275193cdacc7db8f31e8e874b9b84de..d67bec36b3248be8602da562a88aeb58f5effe39 100644
--- a/paddle/fluid/operators/fill_zeros_like_op.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cc
@@ -26,12 +26,8 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
                    "Input(X) of FillZerosLikeOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FillZerosLikeOp should not be null.");
-
-    if (ctx->IsRuntime() &&
-        ctx->GetOutputsVarType("Out")[0] ==
-            framework::proto::VarType::LOD_TENSOR_ARRAY) {
-      return;  // skip runtime infershape when is tensor array;
-    }
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -43,7 +39,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 FillZerosLike Operator.
 
-Fill up a variable with zeros, supporting both LoDTensor and LoDTensorArray.
+Fill up a variable with zeros.
 
 The output will have the same size as the input.
 
 )DOC");
diff --git a/paddle/fluid/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h
index daa6521b32e583f733bc040afb61bf13c4236731..4bbe0df6b6890122381c87494e510cf125792377 100644
--- a/paddle/fluid/operators/fill_zeros_like_op.h
+++ b/paddle/fluid/operators/fill_zeros_like_op.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -24,29 +23,12 @@ template <typename DeviceContext, typename T>
 class FillZerosLikeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto var = context.InputVar("X");
-    if (var->IsType<framework::LoDTensor>()) {
-      auto& input = *context.Input<framework::LoDTensor>("X");
-      auto& output = *context.Output<framework::LoDTensor>("Out");
-      output.Resize(input.dims());
-      output.set_lod(input.lod());
-      output.mutable_data<T>(context.GetPlace());
-      math::SetConstant<DeviceContext, T> setter;
-      setter(context.template device_context<DeviceContext>(), &(output),
-             static_cast<T>(0));
-    } else if (var->IsType<framework::LoDTensorArray>()) {
-      auto& input = *context.Input<framework::LoDTensorArray>("X");
-      auto& output = *context.Output<framework::LoDTensorArray>("Out");
-      output.resize(input.size());
-      for (auto i = 0; i < input.size(); i++) {
-        output[i].Resize(input[i].dims());
-        output[i].set_lod(input[i].lod());
-        output[i].mutable_data<T>(context.GetPlace());
-        math::SetConstant<DeviceContext, T> setter;
-        setter(context.template device_context<DeviceContext>(), &(output[i]),
-               static_cast<T>(0));
-      }
-    }
+    auto* out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    math::SetConstant<DeviceContext, T> setter;
+    setter(context.template device_context<DeviceContext>(), out,
+           static_cast<T>(0));
   }
 };
diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc
index b95109d3f73505fa6b5438326804a2b348fb3668..5641f914523771f47bd7f814bfd39964a53deefc 100644
--- a/paddle/fluid/operators/math/detail/avx_functions.cc
+++ b/paddle/fluid/operators/math/detail/avx_functions.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <immintrin.h>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 // TODO(qingqing) refine this dependence
-#include "paddle/cuda/src/avx_mathfun.h"
+#include "paddle/legacy/cuda/src/avx_mathfun.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index db7634918a5179a61304315ecd08350d23fb4642..cceac402951ae6bf3fe0b4c96af5b7ce9ca1ba0e 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -16,6 +16,7 @@
 #include <algorithm>
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
@@ -62,7 +63,7 @@ struct Formater {
     }
   }
   void PrintDtype() {
-    if (dtype.hash_code() != typeid(const char).hash_code()) {
+    if (!framework::IsType<const char>(dtype)) {
       CLOG << "\tdtype: " << dtype.name() << std::endl;
     }
   }
@@ -83,15 +84,15 @@ struct Formater {
   void PrintData(size_t size) {
     PADDLE_ENFORCE_NOT_NULL(data);
     // print float
-    if (dtype.hash_code() == typeid(const float).hash_code()) {
+    if (framework::IsType<const float>(dtype)) {
       Display<float>(size);
-    } else if (dtype.hash_code() == typeid(const double).hash_code()) {
+    } else if (framework::IsType<const double>(dtype)) {
       Display<double>(size);
-    } else if (dtype.hash_code() == typeid(const int).hash_code()) {
+    } else if (framework::IsType<const int>(dtype)) {
       Display<int>(size);
-    } else if (dtype.hash_code() == typeid(const int64_t).hash_code()) {
+    } else if (framework::IsType<const int64_t>(dtype)) {
       Display<int64_t>(size);
-    } else if (dtype.hash_code() == typeid(const bool).hash_code()) {
+    } else if (framework::IsType<const bool>(dtype)) {
       Display<bool>(size);
     } else {
       CLOG << "\tdata: unprintable type: " << dtype.name() << std::endl;
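Aside: several files in this patch (conditional_block, print, while) replace raw `hash_code()` comparisons with `framework::IsType<T>`. A minimal standalone re-implementation of the idea, assuming the helper simply compares `std::type_index` values (hedged: the real helper lives in `framework/var_type.h`):

```cpp
#include <iostream>
#include <typeindex>
#include <typeinfo>

// Compare a stored type_index against a compile-time type, instead of
// spelling out hash_code() comparisons at every call site.
template <typename T>
bool IsType(const std::type_index& type) {
  return type == typeid(T);
}

int main() {
  std::type_index t = typeid(const float);
  std::cout << IsType<const float>(t)    // 1
            << IsType<const double>(t)   // 0
            << '\n';
}
```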
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
index 72a27d43584d55cd0859c63577ae85ff0f5fdfa8..60e4eb757668e1482090f02aea529aaad3a674d8 100644
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -66,9 +66,19 @@ class ReadOp : public framework::OperatorBase {
     std::vector<std::string> out_arg_names = Outputs("Out");
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
-    PADDLE_ENFORCE(!ins.empty(), "There is no next data.");
+    if (ins.empty()) {
+      if (Attr<bool>("throw_eof_exp")) {
+        PADDLE_THROW("There is no next data.");
+      } else {
+        ins.resize(out_arg_names.size());
+        for (auto& tensor : ins) {
+          // data type is not important for subsequent DataBalanceOpHandle
+          tensor.mutable_data<float>(framework::make_ddim({0}), dev_place);
+        }
+      }
+    }
     PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
-    for (size_t i = 0; i < ins.size(); ++i) {
+    for (size_t i = 0; i < out_arg_names.size(); ++i) {
       auto* out =
           scope.FindVar(out_arg_names[i])->GetMutable<framework::LoDTensor>();
       out->ShareDataWith(ins[i]);
@@ -82,6 +92,10 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("Reader", "(ReaderHolder) The executed reader.");
     AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable();
+    AddAttr<bool>("throw_eof_exp",
+                  "If set true, an exception will be thrown when the Reader "
+                  "yields empty (which means there is no next data).")
+        .SetDefault(true);
     AddComment(R"DOC(
 Read Operator
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 7f743f577fbcdaf6f62e01031e25ef09a842c2e9..918f3be533d51367eade5f5108ad2eab954a9303 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -12,14 +12,108 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/reshape_op.h"
-
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/op_registry.h"
+
 namespace paddle {
 namespace operators {
 
+class ReshapeOp : public framework::OperatorWithKernel {
+ public:
+  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
+            const framework::VariableNameMap &outputs,
+            const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReshapeOp should not be null.");
+
+    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    PADDLE_ENFORCE(!shape.empty(),
+                   "The shape information must be set by Attr(shape).");
+
+    if (ctx->HasInput("Shape") && ctx->IsRuntime()) {
+      // If true, set the shape of Output(Out) according to Input(Shape) in
+      // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel.
+      ctx->ShareLoD("X", /*->*/ "Out");
+      return;
+    }
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = ValidateShape(shape, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+
+  static framework::DDim ValidateShape(const std::vector<int> shape,
+                                       const framework::DDim &in_dims) {
+    const int64_t in_size = framework::product(in_dims);
+    // Only one dimension can be set to -1; its size will be inferred
+    // automatically.
+    const int64_t unk_dim_val = -1;
+    const int64_t copy_dim_val = 0;
+
+    std::vector<int64_t> output_shape(shape.size(), 0);
+    int64_t capacity = 1;
+    int unk_dim_idx = -1;
+    for (size_t i = 0; i < shape.size(); ++i) {
+      if (shape[i] == unk_dim_val) {
+        PADDLE_ENFORCE(
+            unk_dim_idx == -1,
+            "Only one input dimension of Attr(shape) can be unknown.");
+        unk_dim_idx = i;
+      } else if (shape[i] == copy_dim_val) {
+        PADDLE_ENFORCE(
+            static_cast<int>(i) < in_dims.size(),
+            "The index of dimension to copy from input shape must be less "
+            "than the size of input shape.");
+      } else {
+        PADDLE_ENFORCE(
+            shape[i] > 0,
+            "Each input dimension of Attr(shape) must not be negative except "
+            "one unknown dimension.");
+      }
+
+      capacity *= (shape[i] ? shape[i] : in_dims[i]);
+      output_shape[i] =
+          (shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
+    }
+
+    if (unk_dim_idx != -1) {
+      if (in_size > 0) {
+        // in_size < 0 and is un-determinate in compile time, skip the check,
+        // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
+        // capacity = -24, in_size = -8, output_shape[0] = 0
+        // the following check will fail.
+        output_shape[unk_dim_idx] = -in_size / capacity;
+        PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
+                          "Invalid shape is given.");
+      } else {
+        output_shape[unk_dim_idx] = -1;
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
+    }
+    return framework::make_ddim(output_shape);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -107,19 +201,93 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class ReshapeKernel {
+ public:
+  void operator()(const framework::ExecutionContext &ctx) const {
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *in = ctx.Input<framework::LoDTensor>("X");
+
+    auto *shape_tensor = ctx.HasInput("Shape")
+                             ? ctx.Input<framework::LoDTensor>("Shape")
+                             : nullptr;
+
+    framework::DDim out_dims = out->dims();
+
+    if (shape_tensor) {
+      auto *shape_data = shape_tensor->data<int>();
+      framework::Tensor cpu_shape_tensor;
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
+        shape_data = cpu_shape_tensor.data<int>();
+      }
+      auto shape =
+          std::vector<int>(shape_data, shape_data + shape_tensor->numel());
+      out_dims = ReshapeOp::ValidateShape(shape, in->dims());
+    }
+    if (!in->lod().empty()) {
+      PADDLE_ENFORCE_EQ(
+          out_dims[0], in->dims()[0],
+          "Reshape operator cannot reshape an input sequence batch "
+          "into an output sequence batch that has a different "
+          "number of time steps. Please consider using "
+          "sequence_reshape op.");
+    }
Please consider using " + "sequence_reshape op."); + } + + bool inplace = ctx.Attr("inplace"); + out->Resize(out_dims); + if (!inplace) { + out->mutable_data(ctx.GetPlace(), in->type()); + framework::TensorCopySync(*in, ctx.GetPlace(), out); + out->Resize(out_dims); + } else { + out->ShareDataWith(*in); + out->Resize(out_dims); + } + } +}; + +class ReshapeGradKernel { + public: + void operator()(const framework::ExecutionContext &ctx) const { + auto *d_out = ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + bool inplace = ctx.Attr("inplace"); + + auto in_dims = d_x->dims(); + if (!inplace) { + framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); + ctx.device_context().Wait(); + d_x->Resize(in_dims); + } else { + d_x->ShareDataWith(*d_out); + d_x->Resize(in_dims); + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); -REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel, - ops::ReshapeKernel, - ops::ReshapeKernel, - ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL(reshape_grad, ops::ReshapeGradKernel, - ops::ReshapeGradKernel, - ops::ReshapeGradKernel, - ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); + +#ifdef PADDLE_WITH_CUDA +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); +#endif diff --git a/paddle/fluid/operators/reshape_op.cu b/paddle/fluid/operators/reshape_op.cu deleted file mode 100644 index c628c634e2bc9ae260948a6e7ccf786cbd6c5c3c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reshape_op.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
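Aside: `ValidateShape` supports two placeholders, -1 (infer one dimension from the element count) and 0 (copy the corresponding input dimension). A standalone sketch of that inference, with the same capacity trick (the -1 entry makes `capacity` negative, so `-in_size / capacity` yields the missing dimension); names are illustrative:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Resolve a reshape target: -1 infers one dim, 0 copies the input dim.
std::vector<int64_t> InferReshape(const std::vector<int64_t>& in,
                                  const std::vector<int>& shape) {
  int64_t in_size = 1;
  for (int64_t d : in) in_size *= d;
  std::vector<int64_t> out(shape.size());
  int64_t capacity = 1;
  int unk = -1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) unk = static_cast<int>(i);  // resolved below
    out[i] = shape[i] > 0 ? shape[i] : in[i];       // 0 copies in[i]
    capacity *= (shape[i] ? shape[i] : in[i]);
  }
  if (unk != -1) out[unk] = -in_size / capacity;    // capacity < 0 here
  return out;
}

int main() {
  // A [6, 4] tensor reshaped with [0, -1, 2] becomes [6, 2, 2].
  std::vector<int64_t> out = InferReshape({6, 4}, {0, -1, 2});
  assert(out[0] == 6 && out[1] == 2 && out[2] == 2);
}
```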
diff --git a/paddle/fluid/operators/reshape_op.cu b/paddle/fluid/operators/reshape_op.cu
deleted file mode 100644
index c628c634e2bc9ae260948a6e7ccf786cbd6c5c3c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reshape_op.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/reshape_op.h"
-using CUDA = paddle::platform::CUDADeviceContext;
-
-REGISTER_OP_CUDA_KERNEL(reshape, paddle::operators::ReshapeKernel<CUDA, float>,
-                        paddle::operators::ReshapeKernel<CUDA, double>,
-                        paddle::operators::ReshapeKernel<CUDA, int>,
-                        paddle::operators::ReshapeKernel<CUDA, int64_t>);
-REGISTER_OP_CUDA_KERNEL(reshape_grad,
-                        paddle::operators::ReshapeGradKernel<CUDA, float>,
-                        paddle::operators::ReshapeGradKernel<CUDA, double>,
-                        paddle::operators::ReshapeGradKernel<CUDA, int>,
-                        paddle::operators::ReshapeGradKernel<CUDA, int64_t>);
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
deleted file mode 100644
index 3dd8c7c11eca241e747bfa129962032d882ce44c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reshape_op.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class ReshapeOp : public framework::OperatorWithKernel {
- public:
-  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
-            const framework::VariableNameMap &outputs,
-            const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ReshapeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ReshapeOp should not be null.");
-
-    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE(!shape.empty(),
-                   "The shape information must be set by Attr(shape).");
-
-    if (ctx->HasInput("Shape") && ctx->IsRuntime()) {
-      // If true, set the shape of Output(Out) according to Input(Shape) in
-      // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel.
-      ctx->ShareLoD("X", /*->*/ "Out");
-      return;
-    }
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto out_dims = ValidateShape(shape, x_dims);
-    ctx->SetOutputDim("Out", out_dims);
-    if (x_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-
-  static framework::DDim ValidateShape(const std::vector<int> shape,
-                                       const framework::DDim &in_dims) {
-    const int64_t in_size = framework::product(in_dims);
-    // only one dimension can be set to -1, whose size will be automatically
-    // infered.
-    const int64_t unk_dim_val = -1;
-    const int64_t copy_dim_val = 0;
-
-    std::vector<int64_t> output_shape(shape.size(), 0);
-    int64_t capacity = 1;
-    int unk_dim_idx = -1;
-    for (size_t i = 0; i < shape.size(); ++i) {
-      if (shape[i] == unk_dim_val) {
-        PADDLE_ENFORCE(
-            unk_dim_idx == -1,
-            "Only one input dimension of Attr(shape) can be unknown.");
-        unk_dim_idx = i;
-      } else if (shape[i] == copy_dim_val) {
-        PADDLE_ENFORCE(
-            static_cast<int>(i) < in_dims.size(),
-            "The index of dimension to copy from input shape must be less "
-            "than the size of input shape.");
-      } else {
-        PADDLE_ENFORCE(
-            shape[i] > 0,
-            "Each input dimension of Attr(shape) must not be negtive except "
-            "one unknown dimension.");
-      }
-
-      capacity *= (shape[i] ? shape[i] : in_dims[i]);
-      output_shape[i] =
-          (shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
-    }
-
-    if (unk_dim_idx != -1) {
-      if (in_size > 0) {
-        // in_size < 0 and is un-determinate in compile time, skip the check,
-        // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
-        // capacity = -24, in_size = -8, output_shape[0] = 0
-        // the following check will fail.
-        output_shape[unk_dim_idx] = -in_size / capacity;
-        PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
-                          "Invalid shape is given.");
-      } else {
-        output_shape[unk_dim_idx] = -1;
-      }
-    } else {
-      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
-    }
-    return framework::make_ddim(output_shape);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ReshapeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const {
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
-    auto *in = ctx.Input<framework::LoDTensor>("X");
-
-    auto *shape_tensor = ctx.HasInput("Shape")
-                             ? ctx.Input<framework::LoDTensor>("Shape")
-                             : nullptr;
-
-    framework::DDim out_dims = out->dims();
-
-    if (shape_tensor) {
-      auto *shape_data = shape_tensor->data<int>();
-      framework::Tensor cpu_shape_tensor;
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
-        shape_data = cpu_shape_tensor.data<int>();
-      }
-      auto shape =
-          std::vector<int>(shape_data, shape_data + shape_tensor->numel());
-      out_dims = ReshapeOp::ValidateShape(shape, in->dims());
-    }
-    if (!in->lod().empty()) {
-      PADDLE_ENFORCE_EQ(
-          out_dims[0], in->dims()[0],
-          "Reshape operator cannot reshape an input sequence batch "
-          "into an output sequence batch that has a different "
-          "number of time steps. Please consider using "
-          "sequence_reshape op.");
-    }
Please consider using " - "sequence_reshape op."); - } - - bool inplace = ctx.Attr("inplace"); - out->Resize(out_dims); - if (!inplace) { - out->mutable_data(ctx.GetPlace()); - framework::TensorCopySync(*in, ctx.GetPlace(), out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*in); - out->Resize(out_dims); - } - } -}; - -template -class ReshapeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const { - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto *d_x = ctx.Output(framework::GradVarName("X")); - - d_x->mutable_data(ctx.GetPlace()); - bool inplace = ctx.Attr("inplace"); - - auto in_dims = d_x->dims(); - if (!inplace) { - framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); - ctx.device_context().Wait(); - d_x->Resize(in_dims); - } else { - d_x->ShareDataWith(*d_out); - d_x->Resize(in_dims); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index f440058e8db2024f5c8a0129db3af87a80d6e551..733157ea05ed39434b9a750e3a94ea548f512ce6 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/detail/safe_ref.h" namespace paddle { @@ -135,15 +136,14 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); - if (og_outside.Type().hash_code() == - typeid(framework::LoDTensor).hash_code()) { + if (framework::IsType(og_outside.Type())) { auto &outside_tensor = og_outside.Get(); auto &inside_tensor = detail::Ref(og_inside.GetMutable()); inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.ShareDataWith(outside_tensor); - } else if (og_outside.Type().hash_code() == - typeid(framework::LoDTensorArray).hash_code()) { + } else if (framework::IsType( + og_outside.Type())) { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a34e4371cccfd1be0d173fa11595e4368eb65b85..70bc9c4e8340b8e02ef2826d828faff3f6d11965 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -113,7 +113,11 @@ template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { if (UNLIKELY(!(stat))) { +#ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(string::Sprintf(args...)); +#else + LOG(FATAL) << string::Sprintf(args...); +#endif } } @@ -123,8 +127,12 @@ template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { if (UNLIKELY(e)) { +#ifndef REPLACE_ENFORCE_GLOG throw thrust::system_error(e, thrust::cuda_category(), string::Sprintf(args...)); +#else + LOG(FATAL) << string::Sprintf(args...); +#endif } } @@ -132,8 +140,12 @@ template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... 
@@ -132,8 +140,12 @@ template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
     curandStatus_t stat, const Args&... args) {
   if (stat != CURAND_STATUS_SUCCESS) {
+#ifndef REPLACE_ENFORCE_GLOG
     throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
                                string::Sprintf(args...));
+#else
+    LOG(FATAL) << string::Sprintf(args...);
+#endif
   }
 }
 
@@ -143,8 +155,12 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   if (stat == CUDNN_STATUS_SUCCESS) {
     return;
   } else {
+#ifndef REPLACE_ENFORCE_GLOG
     throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) +
                              string::Sprintf(args...));
+#else
+    LOG(FATAL) << string::Sprintf(args...);
+#endif
   }
 }
 
@@ -173,7 +189,11 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
     err = "CUBLAS: license error, ";
   }
+#ifndef REPLACE_ENFORCE_GLOG
   throw std::runtime_error(err + string::Sprintf(args...));
+#else
+  LOG(FATAL) << err << string::Sprintf(args...);
+#endif
 }
 
 #ifndef __APPLE__
@@ -183,8 +203,13 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   if (stat == ncclSuccess) {
     return;
   } else {
+#ifndef REPLACE_ENFORCE_GLOG
     throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) +
                              string::Sprintf(args...));
+#else
+    LOG(FATAL) << platform::dynload::ncclGetErrorString(stat)
+               << string::Sprintf(args...);
+#endif
   }
 }
 #endif  // __APPLE__
@@ -203,6 +228,7 @@ inline void throw_on_error(T e) {
                  __FILE__, __LINE__);                      \
   } while (false)
 
+#ifndef REPLACE_ENFORCE_GLOG
 #define PADDLE_ENFORCE(...)                                \
   do {                                                     \
     try {                                                  \
@@ -212,6 +238,9 @@ inline void throw_on_error(T e) {
                    __FILE__, __LINE__);                    \
     }                                                      \
   } while (false)
+#else
+#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
+#endif
 
 /*
  * Some enforce helpers here, usage:
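Aside: the `REPLACE_ENFORCE_GLOG` option (added in CMakeLists above) selects, at compile time, whether a failed check throws or dies in place with a glog `LOG(FATAL)`, which is often easier to debug because it preserves the failing stack frame. A hedged standalone sketch of the same one-macro-two-behaviors pattern; `LOG_FATAL_SKETCH` is a stand-in for glog, not the real API:

```cpp
#include <cstdio>
#include <cstdlib>
#include <stdexcept>

// Stand-in for glog's LOG(FATAL): log and abort on the spot.
#define LOG_FATAL_SKETCH(msg) \
  do { std::fprintf(stderr, "F %s\n", msg); std::abort(); } while (false)

#ifndef REPLACE_ENFORCE_GLOG
#define ENFORCE_SKETCH(cond, msg) \
  do { if (!(cond)) throw std::runtime_error(msg); } while (false)
#else
#define ENFORCE_SKETCH(cond, msg) \
  do { if (!(cond)) LOG_FATAL_SKETCH(msg); } while (false)
#endif

int main() {
  try {
    ENFORCE_SKETCH(1 + 1 == 3, "arithmetic is broken");
  } catch (const std::exception& e) {
    std::printf("caught: %s\n", e.what());  // default (exception) path
  }
}
```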
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index ed99932546446eb877c9701de15e2d37d29b5f88..33fec2c1073819d88d85a8872227adcb9df3e8f4 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <mkldnn.h>
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/place.h"
@@ -182,10 +183,11 @@ class MKLDNNHandler {
   }
 
   std::shared_ptr<mkldnn::memory> AcquireMemory(
-      mkldnn::memory::primitive_desc& mpd,
-      mkldnn::memory::primitive_desc& user_mpd,
+      mkldnn::memory::primitive_desc& mpd,       // NOLINT
+      mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      const std::string& suffix, std::vector<mkldnn::primitive>& pipeline) {
+      const std::string& suffix,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     // create reorder primitive if the input format is not the preferred one
     auto local_key = key_ + suffix;
     auto key_reorder_p = key_ + suffix + "reorder_p";
@@ -218,7 +220,7 @@ class MKLDNNHandler {
     return target_memory_p;
   }
 
-  static std::string GetHash(mkldnn::memory::dims& operand_dims,
+  static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
                              const std::string& suffix) {
     auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
       std::string dstr = "";
@@ -227,8 +229,9 @@ class MKLDNNHandler {
       }
       return dstr;
     };
+
     return dims2str(operand_dims) + suffix;
-  };
+  }
 
 protected:
   const MKLDNNDeviceContext& dev_ctx_;
@@ -237,5 +240,15 @@ class MKLDNNHandler {
   bool is_reusing_;
 };
 
+inline mkldnn::memory::format MKLDNNFormatForSize(
+    size_t dims_size, mkldnn::memory::format data_format) {
+  if (dims_size == 1) {
+    return mkldnn::memory::format::x;
+  } else if (dims_size == 2) {
+    return mkldnn::memory::format::nc;
+  }
+  return data_format;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 36d080996831d4ad90d92baeafbe964693e2332a..3191f29fc3e5d2914e4b68be9e94ccc0d05f8f93 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -493,7 +493,8 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<Executor>(m, "Executor")
       .def(py::init<const platform::Place&>())
 #ifdef PADDLE_WITH_DISTRIBUTE
-      .def("complete", &Executor::Complete)
+      .def("begin_pass", &Executor::BeginPass)
+      .def("end_pass", &Executor::EndPass)
 #endif
       .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
                      int block_id, bool create_local_scope, bool create_vars) {
@@ -643,7 +644,11 @@ All parameter, weight, gradient are variables in Paddle.
           [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
           [](BuildStrategy &self, const std::string &path) {
             self.debug_graphviz_path_ = path;
-          });
+          })
+      .def_property(
+          "enable_data_balance",
+          [](const BuildStrategy &self) { return self.enable_data_balance_; },
+          [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; });
 
   pe.def(py::init<const std::vector<platform::Place> &,
                   const std::unordered_set<std::string> &,
diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
deleted file mode 100644
index 2dc931c5d7e727679d435470544e60f9b5ce2bde..0000000000000000000000000000000000000000
--- a/paddle/function/BufferArg.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-
-#include "BufferArg.h"
-#include "paddle/math/SparseMatrix.h"
-
-namespace paddle {
-
-const SequenceArg& BufferArg::sequence() const {
-  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
-  return dynamic_cast<const SequenceArg&>(*this);
-}
-
-const SparseMatrixArg& BufferArg::sparse() const {
-  CHECK_EQ(bufferType_, TENSOR_SPARSE);
-  return dynamic_cast<const SparseMatrixArg&>(*this);
-}
-
-SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
-    : BufferArg(sparse, argType),
-      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      nnz_(sparse.getElementCnt()),
-      format_(static_cast<SparseFormat>(sparse.getFormat())),
-      type_(static_cast<SparseValueType>(sparse.getValueType())) {
-  bufferType_ = TENSOR_SPARSE;
-}
-
-SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
-    : BufferArg(sparse, argType),
-      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      nnz_(sparse.getElementCnt()),
-      format_(static_cast<SparseFormat>(sparse.getFormat())),
-      type_(static_cast<SparseValueType>(sparse.getValueType())) {
-  bufferType_ = TENSOR_SPARSE;
-}
-
-}  // namespace paddle
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
deleted file mode 100644
index 6de8c94e778c8d1439b2a2aa3c581a5a3cf70261..0000000000000000000000000000000000000000
--- a/paddle/function/BufferArg.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-
-#include "TensorShape.h"
-#include "TensorType.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-enum BufferType {
-  TENSOR_UNKNOWN = 0,
-  TENSOR_NORMAL = 1,
-  TENSOR_SEQUENCE_ID = 2,
-  TENSOR_SEQUENCE_DATA = 3,
-  TENSOR_SPARSE = 4
-};
-
-class BufferArg;
-class SequenceArg;
-class SparseMatrixArg;
-
-/**
- * \brief BufferArg used as the argument type of Function.
- *
- * The arguments of the Paddle Function have four Buffer types.
- * 1. BufferArg for a dense Buffer of any dimension.
- * 2. SequenceIdArg for a Buffer of sequence start positions.
- * 3. SequenceArg for a Buffer of sequence data.
- * 4. SparseMatrixArg for a Buffer of sparse matrix.
- *
- * Buffer shape
- * For most buffers, the first dimension `shape()[0]` represents
- * the size of the mini-batch.
- *
- * Buffer argType
- * There is an ArgType property for the BufferArg used as Function Output.
- * Whether the result of the Function calculation is assigned to the
- * output Buffer or added to the output Buffer is determined by the
- * argType_ property of the output BufferArg.
- */
-
-// ArgType is only used by output BufferArg.
-// For input argument, argType_ is ignored.
-// For output argument, need to set the argType_ of the BufferArg.
-enum ArgType { - UNSPECIFIED = 0, - ASSIGN_TO = 1, - ADD_TO = 2, -}; -class BufferArg { - public: - void setArgType(ArgType argType) { argType_ = argType; } - - ArgType getArgType() const { return argType_; } - - public: - BufferArg(ValueType valueType, - const TensorShape& shape, - ArgType argType = UNSPECIFIED) - : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) { - bufferType_ = TENSOR_NORMAL; - } - - BufferArg(void* buf, - ValueType valueType, - const TensorShape& shape, - ArgType argType = UNSPECIFIED) - : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) { - bufferType_ = TENSOR_NORMAL; - } - - BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) { - bufferType_ = TENSOR_NORMAL; - } - - BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED) - : buf_( - const_cast(reinterpret_cast(matrix.getData()))), - valueType_(DataType::value), - shape_(2), - argType_(argType) { - bufferType_ = TENSOR_NORMAL; - shape_.setDim(0, matrix.getHeight()); - shape_.setDim(1, matrix.getWidth()); - } - - BufferArg(const Matrix& matrix, - const TensorShape& shape, - ArgType argType = UNSPECIFIED) - : buf_( - const_cast(reinterpret_cast(matrix.getData()))), - valueType_(DataType::value), - shape_(shape), - argType_(argType) { - bufferType_ = TENSOR_NORMAL; - CHECK_EQ(matrix.getElementCnt(), shape.getElements()); - } - - BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED) - : buf_( - const_cast(reinterpret_cast(vector.getData()))), - valueType_(DataType::value), - shape_(1), - argType_(argType) { - bufferType_ = TENSOR_NORMAL; - shape_.setDim(0, vector.getSize()); - } - - BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED) - : buf_( - const_cast(reinterpret_cast(vector.getData()))), - valueType_(VALUE_TYPE_INT32), - shape_(1), - argType_(argType) { - bufferType_ = TENSOR_NORMAL; - shape_.setDim(0, vector.getSize()); - } - - template - typename Tensor::Matrix matrix() const { - CHECK(buf_); - CHECK(valueType_ == DataType::value); - // CHECK(deviceType_ == DType); - CHECK_EQ((size_t)2, shape_.ndims()); - return typename Tensor::Matrix( - reinterpret_cast(buf_), shape_[0], shape_[1]); - } - - template - typename Tensor::Vector vector() const { - CHECK(buf_); - CHECK(valueType_ == DataType::value); - // CHECK(deviceType_ == DType); - CHECK_EQ((size_t)1, shape_.ndims()); - return typename Tensor::Vector( - shape_[0], reinterpret_cast(buf_)); - } - - virtual ~BufferArg() {} - - template - T* data() const { - return reinterpret_cast(buf_); - } - - void* data() const { return buf_; } - ValueType valueType() const { return valueType_; } - BufferType bufferType() const { return bufferType_; } - const TensorShape& shape() const { return shape_; } - bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; } - bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; } - virtual size_t numElements() const { return shape_.getElements(); } - - const SequenceArg& sequence() const; - const SparseMatrixArg& sparse() const; - - protected: - void* buf_; - ValueType valueType_; - TensorShape shape_; - BufferType bufferType_{TENSOR_UNKNOWN}; - ArgType argType_{UNSPECIFIED}; - // TODO(tianbing), add deviceType_ - // leading dimensions. 
The size is dims_.size() - // Dims lds_; -}; - -// sequence start positions in a mini-batch of sequences -// shape_.ndims() == 1 -// valueType_ = int32 -// if a < b then value_.buf_[a] < value_.buf_[b] -class SequenceIdArg : public BufferArg { - public: - SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED) - : BufferArg(VALUE_TYPE_INT32, shape, argType) { - bufferType_ = TENSOR_SEQUENCE_ID; - CHECK_EQ(shape_.ndims(), 1UL); - CHECK_GE(shape_[0], 1UL); - numSeqs_ = shape_[0] - 1; - } - - SequenceIdArg(void* buf, - const TensorShape& shape, - ArgType argType = UNSPECIFIED) - : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) { - bufferType_ = TENSOR_SEQUENCE_ID; - CHECK_EQ(shape_.ndims(), 1UL); - numSeqs_ = shape_[0] - 1; - } - - SequenceIdArg(const IVector& vector) : BufferArg(vector) { - bufferType_ = TENSOR_SEQUENCE_ID; - numSeqs_ = shape_[0] - 1; - } - - ~SequenceIdArg() {} - - size_t numSeqs() const { return numSeqs_; } - - private: - size_t numSeqs_; -}; - -// sequences data -// For mini-batch calculate, -// one batch can contain more than one sequence of data. -// SequenceArg can be used to represent sequences that contain multiple -// unequal lengths. -class SequenceArg : public BufferArg { - public: - SequenceArg(ValueType valueType, - const TensorShape& shape, - ArgType argType = UNSPECIFIED) - : BufferArg(valueType, shape, argType), - startPositions_(TensorShape({shape[0]})) { - bufferType_ = TENSOR_SEQUENCE_DATA; - } - - SequenceArg(void* buf, - ValueType valueType, - const TensorShape& shape, - const SequenceIdArg& startPositions, - ArgType argType = UNSPECIFIED) - : BufferArg(buf, valueType, shape, argType), - startPositions_(startPositions) { - bufferType_ = TENSOR_SEQUENCE_DATA; - } - - SequenceArg(const Matrix& matrix, - const IVector& vector, - ArgType argType = UNSPECIFIED) - : BufferArg(matrix, argType), startPositions_(vector) { - bufferType_ = TENSOR_SEQUENCE_DATA; - } - - ~SequenceArg() {} - - void* getIdBuf() const { return startPositions_.data(); } - size_t numSeqs() const { return startPositions_.numSeqs(); } - SequenceIdArg& getSequenceId() { return startPositions_; } - const SequenceIdArg& getSequenceId() const { return startPositions_; } - - private: - SequenceIdArg startPositions_; -}; - -// sparse matrix -// valueType_ == float or double -// shape_.ndims() == 2 -class SparseMatrixArg : public BufferArg { - public: - SparseMatrixArg(void* buf, - ValueType valueType, - const TensorShape& shape, - const BufferArg& row, - const BufferArg& col, - size_t nnz, - SparseFormat format, - SparseValueType type, - ArgType argType = UNSPECIFIED) - : BufferArg(buf, valueType, shape, argType), - row_(row), - col_(col), - nnz_(nnz), - format_(static_cast(format)), - type_(static_cast(type)) { - bufferType_ = TENSOR_SPARSE; - CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); - CHECK_EQ(shape_.ndims(), 2UL); - CHECK_EQ(row_.shape().ndims(), 1UL); - CHECK_EQ(col_.shape().ndims(), 1UL); - if (format_ == T_SPARSE_CSR) { - CHECK_EQ(nnz, col.shape()[0]); - } else if (format_ == T_SPARSE_CSC) { - CHECK_EQ(nnz, row.shape()[0]); - } - } - - SparseMatrixArg(ValueType valueType, - const TensorShape& shape, - size_t nnz, - SparseFormat format, - SparseValueType type, - ArgType argType = UNSPECIFIED) - : BufferArg(valueType, shape, argType), - row_(BufferArg(nullptr, VALUE_TYPE_INT32)), - col_(BufferArg(nullptr, VALUE_TYPE_INT32)), - nnz_(nnz), - format_(static_cast(format)), - type_(static_cast(type)) { - bufferType_ = TENSOR_SPARSE; - 
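The `numSeqs_ = shape_[0] - 1` convention in SequenceIdArg above reflects that a buffer of k + 1 start positions delimits k sequences. A standalone sketch of that convention, with illustrative values:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Start positions are ascending; sequence i occupies rows
  // [starts[i], starts[i + 1]), so k + 1 entries describe k sequences.
  std::vector<int> starts = {0, 3, 5, 9};  // three sequences in this batch
  std::size_t numSeqs = starts.size() - 1; // mirrors numSeqs_ = shape_[0] - 1
  for (std::size_t i = 0; i < numSeqs; ++i) {
    std::printf("sequence %zu: rows [%d, %d)\n", i, starts[i], starts[i + 1]);
  }
  return 0;
}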
CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); - CHECK_EQ(shape_.ndims(), 2UL); - - /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr - row_ = (format_ == T_SPARSE_CSR - ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1}) - : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})); - /// len of col_ : width + 1 (CSC) or nnz (CSR), buf_ == nullptr - col_ = (format_ == T_SPARSE_CSR - ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}) - : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1})); - } - - SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); - - SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); - - template - typename Tensor::SparseMatrix SparseMatrix() const { - CHECK(buf_); - CHECK(valueType_ == DataType::value); - // CHECK(deviceType_ == DType); - CHECK_EQ(2UL, shape_.ndims()); - return typename Tensor::SparseMatrix( - reinterpret_cast(buf_), - reinterpret_cast(row_.data()), - reinterpret_cast(col_.data()), - shape_[0], - shape_[1], - nnz_, - static_cast(type_), - static_cast(format_), - false); - } - - ~SparseMatrixArg() {} - - void* getRowBuf() const { return row_.data(); } - - void* getColBuf() const { return col_.data(); } - - size_t nnz() const { return nnz_; } - - size_t numElements() const override { return nnz_; } - - SparseDataFormat dataFormat() const { return format_; } - - SparseDataType dataType() const { return type_; } - - private: - BufferArg row_; - BufferArg col_; - size_t nnz_; - SparseDataFormat format_; - SparseDataType type_; -}; - -} // namespace paddle diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp deleted file mode 100644 index 1a6e0110afb64c8b4f164d71e31e5f9bfcdee4a8..0000000000000000000000000000000000000000 --- a/paddle/function/BufferArgTest.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "BufferArg.h" -#include -#include "paddle/math/MemoryHandle.h" - -namespace paddle { - -TEST(BufferTest, BufferArg) { - TensorShape shape({8, 10}); - CpuMemoryHandle memory(shape.getElements() * - sizeOfValuType(VALUE_TYPE_FLOAT)); - BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape); - EXPECT_EQ(buffer.data(), memory.getBuf()); -} - -TEST(BufferTest, SequenceIdArg) { - TensorShape shape({10}); - CpuMemoryHandle memory(shape.getElements() * - sizeOfValuType(VALUE_TYPE_INT32)); - SequenceIdArg buffer(memory.getBuf(), shape); - EXPECT_EQ(buffer.data(), memory.getBuf()); - EXPECT_EQ(buffer.numSeqs(), 9U); -} - -} // namespace paddle diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp deleted file mode 100644 index 1187842452460ac3fd71f48150fab6467f93dc6c..0000000000000000000000000000000000000000 --- a/paddle/function/ContextProjectionOp.cpp +++ /dev/null @@ -1,412 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
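The row_/col_ buffers sized in the SparseMatrixArg constructor above follow the usual compressed-sparse layout. A small sketch of the length rule (printIndexLengths is an illustrative helper, not part of the original code):

#include <cstddef>
#include <cstdio>

// Length rule for the row_/col_ index buffers of an h x w sparse matrix
// with nnz non-zeros:
//   CSR: row_ holds h + 1 offsets, col_ holds nnz column indices
//   CSC: row_ holds nnz row indices, col_ holds w + 1 offsets
void printIndexLengths(std::size_t h, std::size_t w, std::size_t nnz, bool csr) {
  std::size_t rowLen = csr ? h + 1 : nnz;
  std::size_t colLen = csr ? nnz : w + 1;
  std::printf("%s: row_ %zu entries, col_ %zu entries\n",
              csr ? "CSR" : "CSC", rowLen, colLen);
}

int main() {
  printIndexLengths(4, 6, 9, true);   // CSR: row_ 5 entries, col_ 9 entries
  printIndexLengths(4, 6, 9, false);  // CSC: row_ 9 entries, col_ 7 entries
  return 0;
}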
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ContextProjectionOp.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" - -namespace paddle { -/** - * Context Projection Forward with CPU Matrix Device. - * - */ -template <> -void ContextProjectionForward(CpuMatrix& out_mat, - const CpuMatrix& input_mat, - const CpuMatrix& weight_mat, - const CpuIVector& seq_vec, - size_t context_length, - int context_start, - size_t begin_pad) { - const int* starts = seq_vec.getData(); - const size_t num_sequences = seq_vec.getSize() - 1; - for (size_t i = 0; i < num_sequences; ++i) { - for (size_t j = 0; j < context_length; ++j) { - int begin = starts[i] + context_start + j; - int end = starts[i + 1] + context_start + j; - int dst_begin = starts[i]; - int dst_end = starts[i + 1]; - if (begin < starts[i]) { - int64_t pad_size = - std::min(starts[i] - begin, starts[i + 1] - starts[i]); - MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size); - if (weight_mat) { - MatrixPtr sub = - const_cast(weight_mat).subMatrix(j, pad_size); - mat->addAtOffset(*sub, j * input_mat.getWidth()); - } - dst_begin = starts[i] + pad_size; - begin = starts[i]; - } - if (end > starts[i + 1]) { - int64_t pad_size = - std::min(end - starts[i + 1], starts[i + 1] - starts[i]); - MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size); - if (weight_mat) { - MatrixPtr sub = - const_cast(weight_mat) - .subMatrix(begin_pad + context_start + j - pad_size, - pad_size); - mat->addAtOffset(*sub, j * input_mat.getWidth()); - } - dst_end = starts[i + 1] - pad_size; - end = starts[i + 1]; - } - if (end <= begin) continue; - MatrixPtr src = - const_cast(input_mat).subMatrix(begin, end - begin); - MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin); - dst->addAtOffset(*src, j * input_mat.getWidth()); - } - } -} - -/** - * Paddle Function for Context Projection Forward. - * Calculate the output layer value sequence after context projection. - * - * What is Context Projection for a sequence? - * For example, assumed input (x) has 4 words and the dimension of each word - * representation is 2. 
If we use zeros to pad instead of learned weights, - * and the context_length is 3, the output (y) is: - * - * @code - * x = [a1, a2; - * b1, b2; - * c1, c2; - * d1, d2] - * y = [0, 0, a1, a2, b1, b2; - * a1, a2, b1, b2, c1, c2; - * b1, b2, c1, c2, d1, d2; - * c1, c2, d1, d2, 0, 0] - * @endcode - * - * \param outputs[0].matrix output layer value, n * (d * l) - * \param outputs[0].vector start position sequence, n * 1 - * \param inputs[0].matrix input layer value, n * d - * \param inputs[0].vector start position sequence, n * 1 - * \param inputs[1].matrix input layer weight, pad * d - */ -template -class ContextProjectionForwardFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - context_length_ = config.get("context_length"); - context_start_ = config.get("context_start"); - begin_pad_ = config.get("begin_pad"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK(1UL == inputs.size() || 2UL == inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) - << "SequenceArg required here"; - const auto val_seqs = dynamic_cast(inputs[0]); - auto out_seq = dynamic_cast(outputs[0]); - - CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data()); - CHECK_EQ(out_seq.shape().ndims(), 2UL); - CHECK_EQ(val_seqs.shape().ndims(), 2UL); - /// dim of output = dim of input * context_length - CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_); - /// input and output have the same batch_size - CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]); - if (2UL == inputs.size()) { - CHECK_EQ(inputs[1].shape().ndims(), 2UL); - /// dim of input == dim of weight - CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]); - } - - CHECK_EQ(out_seq.getArgType(), ADD_TO); - auto out_mat = out_seq.matrix(); - const auto in_mat = val_seqs.matrix(); - const auto w_mat = - (2UL == inputs.size() && inputs[1].data()) - ? inputs[1].matrix() - : typename Tensor::Matrix(nullptr, 0, 0); - const auto seq_vec = val_seqs.getSequenceId().vector(); - - ContextProjectionForward(out_mat, - in_mat, - w_mat, - seq_vec, - context_length_, - context_start_, - begin_pad_); - } - - private: - size_t context_length_; - int context_start_; - size_t begin_pad_; -}; - -/** - * Context Projection Backward with CPU Matrix Device. - * - */ -template <> -void ContextProjectionBackward(const CpuMatrix& out_grad_mat, - CpuMatrix& in_grad_mat, - CpuMatrix& w_grad_mat, - const CpuIVector& seq_vec, - size_t context_length, - int context_start, - size_t begin_pad, - bool is_padding, - size_t total_pad) { - size_t input_dim = in_grad_mat ? in_grad_mat.getWidth() - : w_grad_mat ? 
w_grad_mat.getWidth() : 0; - const int* starts = seq_vec.getData(); - size_t num_sequences = seq_vec.getSize() - 1; - for (size_t i = 0; i < num_sequences; ++i) { - for (size_t j = 0; j < context_length; ++j) { - int begin = starts[i] + context_start + j; - int end = starts[i + 1] + context_start + j; - int dst_begin = starts[i]; - int dst_end = starts[i + 1]; - if (begin < starts[i]) { - int64_t pad_size = - std::min(starts[i] - begin, starts[i + 1] - starts[i]); - if (is_padding && w_grad_mat) { - MatrixPtr mat = const_cast(out_grad_mat) - .subMatrix(starts[i], pad_size); - MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size); - sub->addAtOffset(*mat, j * input_dim); - } - dst_begin = starts[i] + pad_size; - begin = starts[i]; - } - if (end > starts[i + 1]) { - int64_t pad_size = - std::min(end - starts[i + 1], starts[i + 1] - starts[i]); - if (is_padding && w_grad_mat) { - MatrixPtr mat = const_cast(out_grad_mat) - .subMatrix(starts[i + 1] - pad_size, pad_size); - MatrixPtr sub = w_grad_mat.subMatrix( - begin_pad + context_start + j - pad_size, pad_size); - sub->addAtOffset(*mat, j * input_dim); - } - dst_end = starts[i + 1] - pad_size; - end = starts[i + 1]; - } - if (end <= begin) continue; - if (!in_grad_mat) continue; - MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin); - MatrixPtr dst = const_cast(out_grad_mat) - .subMatrix(dst_begin, dst_end - dst_begin); - src->addAtOffset(*dst, j * input_dim); - } - } -} - -/** - * Context Projection Backward Function. - * Update the weight gradient and input layer gradient with backprop - * - * \param inputs[0].matrix output layer grad, n * (d * l) - * \param inputs[0].vector start position sequence, n * 1 - * \param outputs[0].matrix input layer grad, n * d - * \param outputs[0].vector start position sequence, n * 1 - * \param outputs[1] weight grad, pad * d - */ -template -class ContextProjectionBackwardFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - context_length_ = config.get("context_length"); - context_start_ = config.get("context_start"); - begin_pad_ = config.get("begin_pad"); - is_padding_ = config.get("is_padding"); - total_pad_ = config.get("total_pad"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK(1UL == outputs.size() || 2UL == outputs.size()); - CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) - << "SequenceArg required here"; - const auto in_seq = dynamic_cast(inputs[0]); - auto out_seq = dynamic_cast(outputs[0]); - CHECK(in_seq.data() && in_seq.getSequenceId().data()); - CHECK_EQ(in_seq.shape().ndims(), 2UL); - CHECK_EQ(out_seq.shape().ndims(), 2UL); - CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL); - - /// input and output grad has the same batch_size - CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]); - /// dim of output grad = dim of input grad * context_length - CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_); - CHECK_EQ(out_seq.getArgType(), ADD_TO); - - if (2UL == outputs.size()) { - CHECK_EQ(outputs[1].shape().ndims(), 2UL); - /// dim of input grad == dim of weight - CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]); - CHECK_EQ(outputs[1].getArgType(), ADD_TO); - } - - const auto seq_vec = in_seq.getSequenceId().vector(); - const auto out_grad_mat = in_seq.matrix(); - auto in_grad_mat = - !out_seq.data() ? typename Tensor::Matrix(nullptr, 0, 0) - : out_seq.matrix(); - auto w_grad_mat = - (2UL == outputs.size() && outputs[1].data()) - ? 
outputs[1].matrix() - : typename Tensor::Matrix(nullptr, 0, 0); - - ContextProjectionBackward(out_grad_mat, - in_grad_mat, - w_grad_mat, - seq_vec, - context_length_, - context_start_, - begin_pad_, - is_padding_, - total_pad_); - } - - private: - size_t context_length_; - int context_start_; - size_t begin_pad_; - bool is_padding_; - size_t total_pad_; -}; - -/** - * Context Projection Backward Data Function - * Update input layer grad - * input: sequence of output layer grad - * output: sequence of input layer grad - * - * \param outputs[0].matrix input layer grad, n * d - * \param outputs[0].vector start position sequence, n * 1 - * \param inputs[0].matrix output layer grad, n * (d * l) - * \param inputs[0].vector start position sequence, n * 1 - */ -template -class ContextProjectionBackwardDataFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - context_length_ = config.get("context_length"); - context_start_ = config.get("context_start"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) - << "SequenceArg required here"; - const auto in_seq = dynamic_cast(inputs[0]); - const auto out_seq = dynamic_cast(outputs[0]); - - CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data()); - CHECK_EQ(out_seq.shape().ndims(), 2UL); - CHECK_EQ(in_seq.shape().ndims(), 2UL); - CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL); - /// output layer grad dim == input layer grad dim * context_length_ - CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_); - /// input and output have the same batch_size - CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - - const auto out_grad_mat = in_seq.matrix(); - const auto seq_vec = in_seq.getSequenceId().vector(); - auto in_grad_mat = out_seq.matrix(); - - ContextProjectionBackwardData( - out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_); - } - - private: - size_t context_length_; - int context_start_; -}; - -/** - * Context Projection Backward Weight Function - * Update weight grad by backprop - * input: sequence of output layer grad - * output: weight grad - * - * \param outputs[0] weight grad, pad * d - * \param inputs[0].matrix output layer grad, n * (d * l) - * \param inputs[0].vector start position sequence, n * 1 - */ -template -class ContextProjectionBackwardWeightFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - context_length_ = config.get("context_length"); - context_start_ = config.get("context_start"); - begin_pad_ = config.get("begin_pad"); - total_pad_ = config.get("total_pad"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here"; - const auto in_seq = dynamic_cast(inputs[0]); - CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data()); - CHECK_EQ(outputs[0].shape().ndims(), 2UL); - CHECK_EQ(in_seq.shape().ndims(), 2UL); - CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL); - CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]); - /// output layer grad dim == weight dim * context_length_ - CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_); - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - - const auto seq_vec = 
in_seq.getSequenceId().vector(); - const auto out_grad_mat = in_seq.matrix(); - auto w_grad_mat = outputs[0].matrix(); - ContextProjectionBackwardWeight(out_grad_mat, - w_grad_mat, - seq_vec, - context_length_, - context_start_, - total_pad_, - begin_pad_); - } - - private: - size_t context_length_; - int context_start_; - size_t begin_pad_; - size_t total_pad_; -}; - -REGISTER_TYPED_FUNC(ContextProjectionForward, - CPU, - ContextProjectionForwardFunc); -REGISTER_TYPED_FUNC(ContextProjectionBackward, - CPU, - ContextProjectionBackwardFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(ContextProjectionForward, - GPU, - ContextProjectionForwardFunc); -REGISTER_TYPED_FUNC(ContextProjectionBackward, - GPU, - ContextProjectionBackwardFunc); -REGISTER_TYPED_FUNC(ContextProjectionBackwardData, - GPU, - ContextProjectionBackwardDataFunc); -REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight, - GPU, - ContextProjectionBackwardWeightFunc); -#endif -} // namespace paddle diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp deleted file mode 100644 index d805c3ae927321fc74946e202b98401b6b3cd0f7..0000000000000000000000000000000000000000 --- a/paddle/function/ContextProjectionOpTest.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
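The zero-padding case of ContextProjectionForward above can be reproduced with plain loops: each output row concatenates context_length input rows starting at row + context_start, and rows outside the sequence contribute zeros. A standalone sketch for a single sequence (values are illustrative):

#include <cstdio>
#include <vector>

int main() {
  // One sequence of n rows with dimension d; illustrative values.
  const int n = 4, d = 2;
  const int context_start = -1, context_length = 3;
  std::vector<std::vector<float>> x = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};

  // Each output row concatenates context_length input rows, starting at
  // row i + context_start; rows outside [0, n) contribute zeros.
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < context_length; ++j) {
      int r = i + context_start + j;
      for (int k = 0; k < d; ++k) {
        std::printf("%g ", (r >= 0 && r < n) ? x[r][k] : 0.0f);
      }
    }
    std::printf("\n");  // one output row of width d * context_length
  }
  return 0;
}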
*/ - -#include -#include "FunctionTest.h" -#include "paddle/math/Matrix.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT - -void testMatrixProjectionForward(int context_start, - size_t context_length, - bool is_padding, - size_t batch_size, - size_t input_dim) { - size_t pad = std::max(0, -context_start) + - std::max(0, (int)(context_start + context_length - 1)); - if (pad == 0) is_padding = false; - - CpuGpuFuncCompare test( - "ContextProjectionForward", - FuncConfig() - .set("context_length", context_length) - .set("context_start", context_start) - .set("begin_pad", (size_t)std::max(0, -context_start))); - - // prepare input arguments - test.addSequence(SequenceIdArg(TensorShape{batch_size})); - test.addInputs( - SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim})); - if (is_padding) { // weight - test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim})); - } - test.addOutputs( - SequenceArg(VALUE_TYPE_FLOAT, - TensorShape{batch_size, input_dim * context_length}), - ADD_TO); - - // run Function - test.run(); -} - -void testMatrixProjectionBackward(int context_start, - size_t context_length, - bool is_padding, - size_t batch_size, - size_t input_dim) { - size_t pad = std::max(0, -context_start) + - std::max(0, (int)(context_start + context_length - 1)); - if (pad == 0) is_padding = false; - - CpuGpuFuncCompare test( - "ContextProjectionBackward", - FuncConfig() - .set("context_length", context_length) - .set("context_start", context_start) - .set("begin_pad", (size_t)std::max(0, -context_start)) - .set("is_padding", is_padding) - .set("total_pad", pad)); - - // prepare input arguments - test.addSequence(SequenceIdArg(TensorShape{batch_size})); - test.addInputs(SequenceArg( - VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length})); - test.addOutputs( - SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}), - ADD_TO); - if (is_padding) { // weight - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}), - ADD_TO); - } - - // run Function - test.run(); -} - -TEST(ContextProjection, Projection) { - for (auto context_start : {-5, -3, -1, 0, 3}) { - for (auto context_length : {1, 2, 5, 7}) { - for (auto trainable_padding : {false, true}) { - for (auto batch_size : {1, 2, 5, 20, 100}) { - for (auto input_dim : {15, 32, 63, 128, 200}) { - VLOG(3) << " context_start=" << context_start - << " context_length=" << context_length - << " trainable_padding=" << trainable_padding - << " batch_size=" << batch_size - << " input_dim=" << input_dim; - testMatrixProjectionForward(context_start, - context_length, - trainable_padding, - batch_size, - input_dim); - testMatrixProjectionBackward(context_start, - context_length, - trainable_padding, - batch_size, - input_dim); - } - } - } - } - } -} diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp deleted file mode 100644 index 2c25e1af44965d30591faeccc9a181e36c7e0a0f..0000000000000000000000000000000000000000 --- a/paddle/function/CosSimOp.cpp +++ /dev/null @@ -1,240 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CosSimOp.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" - -namespace paddle { -/** - * Cosine Similarity for CpuMatrix - * - * \param out_mat, output value, size: nSamples * 1. - * \param in1_mat, input value 1, size: nSamples * dim. - * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples). - * \param scale, default 1.0 - * - */ -template <> -void CosSimForward(CpuMatrix& out_mat, - const CpuMatrix& in1_mat, - const CpuMatrix& in2_mat, - real scale) { - CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData()); - size_t num_samples = out_mat.getHeight(); - size_t dim = in1_mat.getWidth(); - /// column vector [nSamples, 1] - real* out = out_mat.getData(); - const real* x = in1_mat.getData(); - const real* y = in2_mat.getData(); - - /// in2 might only have one row or full rows - CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples); - size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim; - for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) { - real square_sum_x = 0; - real square_sum_y = 0; - real xy = 0; - for (size_t j = 0; j < dim; ++j) { - square_sum_x += x[j] * x[j]; - square_sum_y += y[j] * y[j]; - xy += x[j] * y[j]; - } - CHECK(square_sum_x > 0 && square_sum_y > 0); - out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y)); - } -} - -/** - * Cosine Similarity - * for each row i, - * out[i] = scale * cos(input1[i], input2[i]) - * = scale * <input1[i], input2[i]> / sqrt(|input1[i]|^2 * |input2[i]|^2) - * when input2 only has one row, then for each row i, - * out[i] = cos(input1[i], input2[0]) - * - * \param inputs[0] input matrix 1, size: nSamples * dim. - * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples). - * \param outputs[0] output matrix, size : nSamples * 1. - */ - -template -class CosSimForwardFunc : public FunctionBase { - void init(const FuncConfig& config) override { - scale_ = config.get("scale"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(inputs.size(), 2UL); - CHECK_EQ(outputs.size(), 1UL); - - CHECK_EQ(inputs[0].shape().ndims(), 2UL); - CHECK_EQ(inputs[1].shape().ndims(), 2UL); - CHECK_EQ(outputs[0].shape().ndims(), 2UL); - - CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); - CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]); - CHECK_EQ(outputs[0].shape()[1], 1UL); - - CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data()); - - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - auto out_mat = outputs[0].matrix(); - const auto in1_mat = inputs[0].matrix(); - const auto in2_mat = inputs[1].matrix(); - - CosSimForward(out_mat, in1_mat, in2_mat, scale_); - } - - private: - real scale_; -}; - -/** - * Cosine Similarity Derivative for CpuMatrix - * - * \param in1_grad forward input grad 1, size: nSamples * dim. - * \param in2_grad forward input grad 2, - * size: n2 * dim (n2 == 1 or n2 == nSamples). - * - * \param out_grad backward loss output grad, size : nSamples * 1. - * \param out_val forward output value, size: nSamples * 1. - * \param in1_val forward input value 1, size: nSamples * dim. 
- * \param in2_val forward input value 2, - * size: n2 * dim (n2 == 1 or n2 == nSamples). - * \param scale, default 1.0 - */ -template <> -void CosSimBackward(const CpuMatrix& out_grad, - const CpuMatrix& out_val, - const CpuMatrix& in1_val, - const CpuMatrix& in2_val, - CpuMatrix& in1_grad, - CpuMatrix& in2_grad, - real scale) { - CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() && - in2_val.getData() && in1_grad.getData() && in2_grad.getData()); - CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required"; - - const real* grad = out_grad.getData(); - const real* out = out_val.getData(); - const real* prev_out_x = in1_val.getData(); - const real* prev_out_y = in2_val.getData(); - real* prev_grad_x = in1_grad.getData(); - real* prev_grad_y = in2_grad.getData(); - - size_t num_samples = out_grad.getHeight(); - size_t dim = in1_val.getWidth(); - CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight()); - CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples); - size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim; - for (size_t i = 0; i < num_samples; ++i, - prev_out_x += dim, - prev_out_y += inc, - prev_grad_x += dim, - prev_grad_y += inc) { - real square_sum_x = 0; - real square_sum_y = 0; - real xy = 0; - for (size_t j = 0; j < dim; ++j) { - square_sum_x += prev_out_x[j] * prev_out_x[j]; - square_sum_y += prev_out_y[j] * prev_out_y[j]; - xy += prev_out_x[j] * prev_out_y[j]; - } - CHECK(square_sum_x > 0 && square_sum_y > 0); - if (xy == 0) { - real reciprocal = - 1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y)); - for (size_t j = 0; j < dim; ++j) { - prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal; - prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal; - } - } else { - real reciprocal_xy = 1.0f / xy; - real reciprocal_square_sum_x = 1.0f / square_sum_x; - real reciprocal_square_sum_y = 1.0f / square_sum_y; - for (size_t j = 0; j < dim; ++j) { - prev_grad_x[j] += - out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy - - prev_out_x[j] * reciprocal_square_sum_x); - prev_grad_y[j] += - out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy - - prev_out_y[j] * reciprocal_square_sum_y); - } - } - } -} - -/** - * Cosine Similarity backward Derivative - * - * \param outputs[0] forward input grad 1, size: nSamples * dim. - * \param outputs[1] forward input grad 2, - * size: n2 * dim (n2 == 1 or n2 == nSamples). - * - * \param inputs[0] backward loss output grad, size : nSamples * 1. - * \param inputs[1] forward output value, size: nSamples * 1. - * \param inputs[2] forward input value 1, size: nSamples * dim. - * \param inputs[3] forward input value 2, - * size: n2 * dim (n2 == 1 or n2 == nSamples). 
- */ -template -class CosSimBackwardFunc : public FunctionBase { - void init(const FuncConfig& config) override { - scale_ = config.get("scale"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(inputs.size(), 4UL); - CHECK_EQ(outputs.size(), 2UL); - /// dim of out_grad and out_val == 1, column vector - CHECK_EQ(inputs[0].shape()[1], 1UL); - CHECK_EQ(inputs[1].shape()[1], 1UL); - /// nSamples of out_grad == out_val == in_val1 == in_grad1 - CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]); - CHECK_EQ(inputs[0].shape()[0], inputs[0].shape()[0]); - CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]); - /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2 - CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]); - CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]); - CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]); - - CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() && - inputs[3].data() && outputs[0].data() && outputs[1].data()); - - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - CHECK_EQ(outputs[1].getArgType(), ADD_TO); - - const auto out_grad = inputs[0].matrix(); - const auto out_val = inputs[1].matrix(); - const auto in1_val = inputs[2].matrix(); - const auto in2_val = inputs[3].matrix(); - auto in1_grad = outputs[0].matrix(); - auto in2_grad = outputs[1].matrix(); - - CosSimBackward( - out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_); - } - - private: - real scale_; -}; - -REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc); -REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc); -REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc); -#endif -} // namespace paddle diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp deleted file mode 100644 index 42b02da0cb07a57e030a3edb08bea23203efd688..0000000000000000000000000000000000000000 --- a/paddle/function/CosSimOpTest.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
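A scalar reference for the per-row computation in CosSimForward above (cosSimRow is an illustrative name; the kernel additionally checks that neither squared sum is zero):

#include <cmath>
#include <cstdio>

// Per-row reference: out = scale * <x, y> / (|x| * |y|).
float cosSimRow(const float* x, const float* y, int dim, float scale) {
  float xx = 0, yy = 0, xy = 0;
  for (int j = 0; j < dim; ++j) {
    xx += x[j] * x[j];
    yy += y[j] * y[j];
    xy += x[j] * y[j];
  }
  return scale * xy / (std::sqrt(xx) * std::sqrt(yy));
}

int main() {
  float x[] = {1, 0};
  float y[] = {1, 1};
  std::printf("%f\n", cosSimRow(x, y, 2, 1.0f));  // prints ~0.707107
  return 0;
}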
*/ - -#include -#include "FunctionTest.h" -#include "paddle/math/Matrix.h" - -using namespace paddle; // NOLINT - -void testCosSimForward(size_t height_x, - size_t height_y, - size_t width, - real scale) { - CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale)); - // prepare input arguments - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width})); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width})); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}), - ASSIGN_TO); - // run Function - test.run(); -} - -void testCosSimBackward(size_t height_x, - size_t height_y, - size_t width, - real scale) { - CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale)); - // prepare input arguments - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width})); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width})); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}), - ADD_TO); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}), - ADD_TO); - // run Function - test.run(); -} - -TEST(Matrix, cosSim) { - for (auto height_x : {10, 100, 1000}) { - for (auto height_y : {1, height_x}) { - for (auto width : {10, 100, 1000}) { - for (auto scale : {1.0, 2.0}) { - testCosSimForward(height_x, height_y, width, scale); - testCosSimBackward(height_x, height_y, width, scale); - } - } - } - } -} diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp deleted file mode 100644 index 5bd98910fe838751935f8ef2387ce96e755c6df1..0000000000000000000000000000000000000000 --- a/paddle/function/CropOp.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "CropOp.h" -#include "paddle/function/TensorShape.h" -#include "paddle/math/Vector.h" - -namespace paddle { - -template <> -void Crop(real* outputs, - const real* inputs, - const TensorShape inShape, - const TensorShape outShape, - const FuncConfig& conf) { - std::vector crop_corner = - conf.get>("crop_corner"); - int cCrop = crop_corner[1]; - int hCrop = crop_corner[2]; - int wCrop = crop_corner[3]; - - int num = inShape[0]; - int inC = inShape[1]; - int inH = inShape[2]; - int inW = inShape[3]; - - int outC = outShape[1]; - int outH = outShape[2]; - int outW = outShape[3]; - - for (int n = 0; n < num; n++) { - for (int c = 0; c < outC; c++) { - for (int h = 0; h < outH; h++) { - int outoff = ((n * outC + c) * outH + h) * outW; - int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop; - memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real)); - } - } - } -} - -template <> -void CropGrad(const real* inGrad, - real* outGrad, - const TensorShape inShape, - const TensorShape outShape, - const FuncConfig& conf) { - std::vector crop_corner = - conf.get>("crop_corner"); - int cCrop = crop_corner[1]; - int hCrop = crop_corner[2]; - int wCrop = crop_corner[3]; - - int num = outShape[0]; - int outC = outShape[1]; - int outH = outShape[2]; - int outW = outShape[3]; - - int inC = inShape[1]; - int inH = inShape[2]; - int inW = inShape[3]; - - for (int n = 0; n < num; n++) { - for (int c = 0; c < inC; c++) { - for (int h = 0; h < inH; h++) { - int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop; - int inoff = ((n * inC + c) * inH + h) * inW; - CpuVector inG = CpuVector(inW, const_cast(inGrad + inoff)); - CpuVector outG = CpuVector(inW, outGrad + outoff); - outG += inG; - } - } - } -} - -/** - * \brief Crop input according to the specify corner and shape. - * The input and output is a 4D tensor. In CropFunc, we only - * crop the 2nd to 4th dimension. - * - * Argument in this Function: - * \param pad_ A struct object contains the cropping corner and shape. - * \param inputs A 4D tensor, only one input. - * \param outputs A 4D tensor, the output value after cropping. - * - * For example, - * Input(2,2,2,3) = [ - * [ [[1,2,3], [3,4,5]], - * [[2,3,5], [1,6,7]] ], - * [ [[4,3,1], [1,8,7]], - * [[3,8,9], [2,3,5]] ] - * ] # the input shape is (2,2,2,3) - * - * pad_: if corner = (0,1,1) and crop_shape = (2,1,2) - * Output(2,2,1,2) = [ - * [ [[4,5]], - * [[6,7]] ], - * [ [[8,7]], - * [[3,5]] ] - * ] # the input shape is (2,2,2,3) - */ -template -class CropFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { conf_ = config; } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - - TensorShape inShape = inputs[0].shape(); - TensorShape outShape = outputs[0].shape(); - - Crop(outputs[0].data(), - inputs[0].data(), - inShape, - outShape, - conf_); - } - - private: - FuncConfig conf_; -}; - -/** - * \brief The backward propagation of cropping Function. - * - * Argument in this Function: - * \param crop_ The same meaning as it in CropFunc. - * \param inputs The gradient with respect to the output value of CropFunc. - * \param outputs The gradient with respect to the input value of CropFunc. 
- */ - -template -class CropGradFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { conf_ = config; } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - - TensorShape outShape = outputs[0].shape(); - TensorShape inShape = inputs[0].shape(); - - CropGrad(inputs[0].data(), - outputs[0].data(), - inShape, - outShape, - conf_); - } - - private: - FuncConfig conf_; -}; - -REGISTER_TYPED_FUNC(Crop, CPU, CropFunc); -REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(Crop, GPU, CropFunc); -REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp deleted file mode 100644 index 7ff9227e5c2702d9d5334db501730b57ec10bfe3..0000000000000000000000000000000000000000 --- a/paddle/function/CrossMapNormalOp.cpp +++ /dev/null @@ -1,344 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CrossMapNormalOp.h" -#include "paddle/math/Vector.h" - -namespace paddle { - -template <> -void CrossMapNormal(real* outputs, - real* denoms, - const real* inputs, - size_t numSamples, - size_t channels, - size_t height, - size_t width, - size_t size, - real scale, - real pow) { - size_t oneImage = height * width; - size_t oneSample = channels * oneImage; - - CpuVector outputsV(numSamples * oneSample, outputs); - CpuVector inputsV(numSamples * oneSample, const_cast(inputs)); - CpuVector denomsV(numSamples * oneSample, denoms); - - // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow) - // x represents inputs - // f(x) represents outputs - // denoms save the intermediate result for backward - denomsV = denomsV.constant(1.0); - const int start = -((int)size - 1) / 2; - const int end = (int)size + start; - for (size_t i = 0; i < numSamples; i++) { - real* oneDenom = denoms + i * oneSample; - real* oneInput = const_cast(inputs) + i * oneSample; - for (int c = 0; c < (int)channels; c++) { - CpuVector denom(oneImage, oneDenom + c * oneImage); - for (int s = start; s < end; s++) { - if (c + s >= 0 && c + s < (int)channels) { - CpuVector input(oneImage, oneInput + (c + s) * oneImage); - denom += input.square() * scale; - } - } - } - } - - outputsV = inputsV * denomsV.pow(-pow); -} - -template <> -void CrossMapNormalGrad(real* inputsGrad, - const real* inputsValue, - const real* outputsValue, - const real* outputsGrad, - const real* denoms, - size_t numSamples, - size_t channels, - size_t height, - size_t width, - size_t size, - real scale, - real pow) { - size_t oneSample = channels * height * width; - std::function oneImage = [=](real* data, - size_t offset) { - return CpuVector(height * width, data + offset); - }; - - const int start = -((int)size) / 2; - const int end = (int)size + start; - const real ratio = -(real)2 * scale * pow; - for (size_t i 
= 0; i < numSamples; i++) { - size_t sOffset = i * oneSample; - real* oneInputGrad = inputsGrad + sOffset; - real* oneInputValue = const_cast(inputsValue) + sOffset; - real* oneDenom = const_cast(denoms) + sOffset; - real* oneOutputGrad = const_cast(outputsGrad) + sOffset; - real* oneOutputValue = const_cast(outputsValue) + sOffset; - - for (int c = 0; c < (int)channels; c++) { - size_t cOffset = c * height * width; - CpuVector inputGrad = oneImage(oneInputGrad, cOffset); - CpuVector inputValue = oneImage(oneInputValue, cOffset); - CpuVector denom = oneImage(oneDenom, cOffset); - CpuVector outputGrad = oneImage(oneOutputGrad, cOffset); - - inputGrad = inputGrad + denom.pow(-pow) * outputGrad; - for (int s = start; s < end; s++) { - if (c + s >= 0 && c + s < (int)channels) { - size_t offset = (c + s) * height * width; - CpuVector output = oneImage(oneOutputValue, offset); - CpuVector outputGrad = oneImage(oneOutputGrad, offset); - CpuVector denom = oneImage(oneDenom, offset); - - inputGrad += ((outputGrad * output * ratio) / denom) * inputValue; - } - } - } - } -} - -/** - * \brief Normalization with across maps. - * - * This Function comes from the paper - * "ImageNet Classification with Deep Convolutional Neural Networks". - * - * The original formula is: - * - * Input(i, x, y) - * Output(i, x, y) = ---------------------------------------------- - * -- upper - * (k + alpha * > (Input(j, x, y))^2) ^ (beta) - * -- j = lower - * - * upper is `min(C, c + N/2)` - * lower if `max(0, c - N/2)` - * - * Function implementation: - * - * inputs and outpus is NCHW format, while input.shape.ndims() is equal 4. - * And the meaning of each dimension(0-3) is respectively batch size, - * feature maps, rows and columns. - * - * Input and Output in the above formula is for each map(i) of one image, and - * Input(i, x, y), Output(i, x, y) represents an element in an image. - * - * C is the number of feature maps of one image, and N is a hyper-parameters - * is configured when Function is initialized. The sum in the denominator - * is the sum of the same position in the neighboring maps. - * - * In the implementation of Function, k is equal to 1, - * so Function has no argument for k. - * - * Function Arguments: - * - * \param size_ represent N - * \param scale_ represent alpha - * \param pow_ represent beta - * \param inputs[0] represent Input - * \param outputs[0] represent Output - * \param outputs[1] represent The denominator in the formula(except beta) - * - * Note: - * Save output[1] is to simplify the backward calculation. - * TODO, if only consider the forward calculation, we can optimize to - * remove the output[1]. - */ -template -class CrossMapNormalFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - // function arguments - size_ = config.get("size"); - scale_ = config.get("scale"); - pow_ = config.get("pow"); - - // number of inputs and outputs - numInputs_ = 1; - numOutputs_ = 2; - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - check(inputs, outputs); - // ArgType check still on here, - // not sure whether it is better to put inside the check. 
- CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO); - size_t batchSize = inputs[0].shape()[0]; - size_t maps = inputs[0].shape()[1]; - size_t rows = inputs[0].shape()[2]; - size_t columns = inputs[0].shape()[3]; - - CrossMapNormal(outputs[0].data(), - outputs[1].data(), - inputs[0].data(), - batchSize, - maps, - rows, - columns, - size_, - scale_, - pow_); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - - CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); - CHECK(inputs[0].shape() == outputs[0].shape()); - CHECK(inputs[0].shape() == outputs[1].shape()); - } - - // Only the shape of the input is needed to calculate the - // number of floating-point operations. - size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ((size_t)numInputs_, inputs.size()); - size_t batchSize = inputs[0].shape()[0]; - size_t maps = inputs[0].shape()[1]; - size_t rows = inputs[0].shape()[2]; - size_t columns = inputs[0].shape()[3]; - - // number of floating-point operations - // an approximate value - size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3); - - return ops; - } - - private: - size_t size_; - real scale_; - real pow_; -}; - -/** - * \brief Backward calculation for normalization across feature maps. - * - * Function implementation: - * - * The implementation of this Function is derived from the - * CrossMapNormalFunc implementation. - * - * InputGrad = OutputGrad * denoms ^ (-beta) - * + sum_{j = lower}^{upper} (OutputGrad * OutputValue - * * (-2 * alpha * beta) / denoms) * InputValue - * - * The data format of inputs/outputs is the same as in the forward interface, - * and is NCHW. - * - * The upper and lower bounds are the same as in forward. The logic of the sum - * is also the same as in forward. - * - * Function Arguments: - * - * \param size_ represent N - * \param scale_ represent alpha - * \param pow_ represent beta - * \param inputs[0] represent InputValue, inputs[0] of CrossMapNormalFunc - * \param inputs[1] represent OutputValue, outputs[0] of CrossMapNormalFunc - * \param inputs[2] represent OutputGrad - * \param inputs[3] represent denoms, outputs[1] of CrossMapNormalFunc - * This is the intermediate result that is - * preserved in the forward calculation. - * \param outputs[0] represent InputGrad - */ -template -class CrossMapNormalGradFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - // function arguments - size_ = config.get("size"); - scale_ = config.get("scale"); - pow_ = config.get("pow"); - - // number of inputs and outputs - numInputs_ = 4; - numOutputs_ = 1; - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - check(inputs, outputs); - if (outputs[0].getArgType() != ADD_TO) { - // The kernel accumulates into the output, so when the ArgType is not - // ADD_TO the output needs to be cleared first. 
- typename Tensor::Vector tmp( - outputs[0].shape().getElements(), outputs[0].data()); - tmp.zero(); - } - - size_t batchSize = inputs[0].shape()[0]; - size_t maps = inputs[0].shape()[1]; - size_t rows = inputs[0].shape()[2]; - size_t columns = inputs[0].shape()[3]; - - CrossMapNormalGrad(outputs[0].data(), - inputs[0].data(), - inputs[1].data(), - inputs[2].data(), - inputs[3].data(), - batchSize, - maps, - rows, - columns, - size_, - scale_, - pow_); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - - CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); - CHECK(inputs[0].shape() == inputs[1].shape()); - CHECK(inputs[0].shape() == inputs[2].shape()); - CHECK(inputs[0].shape() == inputs[3].shape()); - CHECK(inputs[0].shape() == outputs[0].shape()); - } - - // Only need the shape of one input, can calculate the - // floating-point operation. - size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_LT((size_t)1, inputs.size()); - size_t batchSize = inputs[0].shape()[0]; - size_t maps = inputs[0].shape()[1]; - size_t rows = inputs[0].shape()[2]; - size_t columns = inputs[0].shape()[3]; - - // number of floating-point operations - // an approximate value - size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2); - - return ops; - } - - private: - size_t size_; - real scale_; - real pow_; -}; - -REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc); -REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc); -REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu deleted file mode 100644 index 2c0e71b19b22abac25d273d8bbeddc330e67f8b0..0000000000000000000000000000000000000000 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ /dev/null @@ -1,376 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
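A scalar reference for one pixel of the CrossMapNormal forward pass described above, with k = 1 as noted in the doc comment (the values and constants below are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // Values at one (x, y) position across C = 4 feature maps (illustrative).
  std::vector<float> in = {1, 2, 3, 4};
  const int N = 3;                        // window size ("size_" above)
  const float alpha = 1e-4f, beta = 0.75f;
  const int C = static_cast<int>(in.size());

  // out(c) = in(c) * (1 + alpha * sum_{j in window} in(j)^2)^(-beta), k = 1.
  for (int c = 0; c < C; ++c) {
    float sum = 0;
    for (int j = std::max(0, c - N / 2); j <= std::min(C - 1, c + N / 2); ++j) {
      sum += in[j] * in[j];
    }
    std::printf("out(%d) = %f\n", c, in[c] * std::pow(1 + alpha * sum, -beta));
  }
  return 0;
}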
*/ - -#include "DepthwiseConvOp.h" -#include "paddle/math/BaseMatrix.h" - -namespace paddle { - -// CUDA kernel to compute the depthwise convolution forward pass -template -__global__ void ConvolutionDepthwiseForward(const int nthreads, - const T* const inputData, - const T* const filterData, - const int batchSize, - const int outputChannels, - const int outputHeight, - const int outputWidth, - const int inputChannels, - const int inputHeight, - const int inputWidth, - const int filterMultiplier, - const int filterHeight, - const int filterWidth, - const int strideH, - const int strideW, - const int paddingH, - const int paddingW, - T* const outputData) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - - if (index < nthreads) { - const int batch = index / outputChannels / outputHeight / outputWidth; - const int c_out = (index / outputHeight / outputWidth) % outputChannels; - const int h_out = (index / outputWidth) % outputHeight; - const int w_out = index % outputWidth; - - const int c_in = c_out / filterMultiplier; - const T* weight = filterData + c_out * filterHeight * filterWidth; - T value = 0; - const int h_in_start = -paddingH + h_out * strideH; - const int w_in_start = -paddingW + w_out * strideW; - const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; - const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; - if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) && - (w_in_end < inputWidth)) { - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - const int offset = - ((batch * inputChannels + c_in) * inputHeight + h_in) * - inputWidth + - w_in; - value += (*weight) * inputData[offset]; - ++weight; - } - } - } else { - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && - (w_in < inputWidth)) { - const int offset = - ((batch * inputChannels + c_in) * inputHeight + h_in) * - inputWidth + - w_in; - value += (*weight) * inputData[offset]; - } - ++weight; - } - } - } - outputData[index] = value; - } -} - -// CUDA kernel to compute the depthwise convolution backprop w.r.t input. -template -__global__ void ConvolutionDepthwiseInputBackward(const int nthreads, - const T* const top_diff, - const T* const weight_data, - const int num, - const int outputChannels, - const int outputHeight, - const int outputWidth, - const int inputChannels, - const int inputHeight, - const int inputWidth, - const int filterMultiplier, - const int filterHeight, - const int filterWidth, - const int strideH, - const int strideW, - const int paddingH, - const int paddingW, - T* const bottom_diff) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < nthreads) { - const int batch = index / inputChannels / inputHeight / inputWidth; - const int c_in = (index / inputHeight / inputWidth) % inputChannels; - const int h_in = (index / inputWidth) % inputHeight; - const int w_in = index % inputWidth; - - const int c_out_start = c_in * filterMultiplier; - - int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH; - h_out_start = 0 > h_out_start ? 
0 : h_out_start; - int h_out_end = (h_in + paddingH) / strideH; - h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end; - int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW; - w_out_start = 0 > w_out_start ? 0 : w_out_start; - int w_out_end = (w_in + paddingW) / strideW; - w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end; - - T value = 0; - - for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; - c_out++) { - for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { - const int filter_h = h_in + paddingH - h_out * strideH; - for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { - const int filter_w = w_in + paddingW - w_out * strideW; - const int filter_offset = c_out * filterHeight * filterWidth + - filter_h * filterWidth + filter_w; - const int top_diff_offset = - ((batch * outputChannels + c_out) * outputHeight + h_out) * - outputWidth + - w_out; - value += top_diff[top_diff_offset] * weight_data[filter_offset]; - } - } - } - bottom_diff[index] += value; - } -} - -// CUDA kernel to compute the depthwise convolution backprop w.r.t filter. -template -__global__ void ConvolutionDepthwiseFilterBackward(const int num_i, - const int nthreads, - const T* const top_diff, - const T* const inputData, - const int num, - const int outputChannels, - const int outputHeight, - const int outputWidth, - const int inputChannels, - const int inputHeight, - const int inputWidth, - const int filterMultiplier, - const int filterHeight, - const int filterWidth, - const int strideH, - const int strideW, - const int paddingH, - const int paddingW, - T* const buffer_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < nthreads) { - const int h_out = (index / outputWidth) % outputHeight; - const int w_out = index % outputWidth; - const int kh = - (index / filterWidth / outputHeight / outputWidth) % filterHeight; - const int kw = (index / outputHeight / outputWidth) % filterWidth; - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && - (w_in < inputWidth)) { - const int c_out = - index / (filterHeight * filterWidth * outputHeight * outputWidth); - const int c_in = c_out / filterMultiplier; - const int batch = num_i; - const int top_offset = - ((batch * outputChannels + c_out) * outputHeight + h_out) * - outputWidth + - w_out; - const int bottom_offset = - ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + - w_in; - buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; - } else { - buffer_data[index] = 0; - } - } -} - -template -class DepthwiseConvFunctor { - public: - void operator()(const T* inputData, - const T* filterData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* outputData) { - int outputSize = batchSize * outputChannels * outputHeight * outputWidth; - - size_t blocks = (outputSize + 1024 - 1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks + 512 - 1) / 512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - ConvolutionDepthwiseForward<<>>( - outputSize, - inputData, - filterData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - 
filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - outputData); - } -}; - -template -class DepthwiseConvGradInputFunctor { - public: - void operator()(const T* outputGrad, - const T* filterData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* inputGrad) { - int inputSize = batchSize * inputChannels * inputHeight * inputWidth; - - size_t blocks = (inputSize + 1024 - 1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks + 512 - 1) / 512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - ConvolutionDepthwiseInputBackward - // NOLINT_NEXT_LINE(whitespace/operators) - <<>>(inputSize, - outputGrad, - filterData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - inputGrad); - } -}; - -template -class DepthwiseConvGradFilterFunctor { - public: - void operator()(const T* outputGrad, - const T* inputData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* colData, - T* filterGrad) { - int colDataSize = outputChannels * filterHeight * filterWidth * - outputHeight * outputWidth; - - size_t blocks = (colDataSize + 1024 - 1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks + 512 - 1) / 512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, - 1, - filterGrad, - false, - true); - - for (int i = 0; i < batchSize; i++) { - ConvolutionDepthwiseFilterBackward< - T><<>>(i, - colDataSize, - outputGrad, - inputData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - colData); - int K = outputHeight * outputWidth; - int M = colDataSize / K; - - BaseMatrix colMatrix(M, K, colData, false, true); - filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); - } - } -}; - -#ifdef PADDLE_TYPE_DOUBLE -template class DepthwiseConvGradInputFunctor; -template class DepthwiseConvFunctor; -template class DepthwiseConvGradFilterFunctor; -#else -template class DepthwiseConvGradInputFunctor; -template class DepthwiseConvFunctor; -template class DepthwiseConvGradFilterFunctor; -#endif - -} // namespace paddle diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp deleted file mode 100644 index 8e9dbbd7a154095a7298bb2f59a82d13a60f9bd3..0000000000000000000000000000000000000000 --- a/paddle/function/EigenGemm.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
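All three depthwise functors above size their launches the same way: a 1-D problem of n work items is folded into a two-dimensional grid (a single grid dimension was historically capped at 65535 blocks), and each kernel linearizes (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x back into a flat index. A hedged sketch of that pattern in isolation; the kernel and its work are illustrative, not from the patch:

#include <cuda_runtime.h>

// Fold a 1-D range of n items into a (512 x blockY) grid of 1024-thread
// blocks, exactly as the depthwise functors above do.
__global__ void fillOnes(const int n, float* out) {
  // Recover the flat index the same way the kernels above do.
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < n) {  // the grid may overshoot n, so guard every access
    out[index] = 1.0f;
  }
}

void launchFillOnes(int n, float* devOut) {
  size_t blocks = (n + 1024 - 1) / 1024;
  size_t blockX = 512;
  size_t blockY = (blocks + 512 - 1) / 512;
  dim3 threads(1024, 1);
  dim3 grid(blockX, blockY);
  fillOnes<<<grid, threads>>>(n, devOut);
}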
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/function/EigenThreadDevice.h" - -namespace paddle { - -template -struct EigenBlasGemm { - typedef Eigen::TensorMap, - Eigen::Aligned> - EigenMatrix; - - static void compute(const bool transA, - const bool transB, - const int M, - const int N, - const int K, - const T alpha, - const T* A, - const int lda, - const T* B, - const int ldb, - const T beta, - T* C, - const int ldc) { - Eigen::array sizeA; - if (transA) { - sizeA[0] = K; - sizeA[1] = M; - CHECK_EQ(M, lda); - } else { - sizeA[0] = M; - sizeA[1] = K; - CHECK_EQ(K, lda); - } - Eigen::array sizeB; - if (transB) { - sizeB[0] = N; - sizeB[1] = K; - CHECK_EQ(K, ldb); - } else { - sizeB[0] = K; - sizeB[1] = N; - CHECK_EQ(N, ldb); - } - Eigen::array sizeC = {{M, ldc}}; - Eigen::array offsetC = {{0, 0}}; - Eigen::array extentC = {{M, N}}; - - const EigenMatrix a(const_cast(A), sizeA); - const EigenMatrix b(const_cast(B), sizeB); - EigenMatrix c(C, sizeC); - - typedef typename Eigen::Tensor::DimensionPair DimPair; - Eigen::array dims; - dims[0] = DimPair(1, 0); - dims[0].first = transA ? 0 : 1; - dims[0].second = transB ? 1 : 0; - - auto* device = EigenDeviceWarpper::device(); - if (N == ldc) { - if (alpha == T(1) && beta == T(0)) { - c.device(*device) = a.contract(b, dims); - } else if (alpha == T(1) && beta == T(1)) { - c.device(*device) += a.contract(b, dims); - } else { - c.device(*device) = alpha * a.contract(b, dims) + beta * c; - } - } else { - if (alpha == T(1) && beta == T(0)) { - c.slice(offsetC, extentC).device(*device) = a.contract(b, dims); - } else if (alpha == T(1) && beta == T(1)) { - c.slice(offsetC, extentC).device(*device) += a.contract(b, dims); - } else { - c.slice(offsetC, extentC).device(*device) = - alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC); - } - } - EigenDeviceWarpper::free_device(device); - } -}; - -#ifdef PADDLE_TYPE_DOUBLE -template struct EigenBlasGemm; -#else -template struct EigenBlasGemm; -#endif - -} // namespace paddle diff --git a/paddle/function/Function.h b/paddle/function/Function.h deleted file mode 100644 index a6c14ef29b760faa393c37bd2357824a061c7b38..0000000000000000000000000000000000000000 --- a/paddle/function/Function.h +++ /dev/null @@ -1,214 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "BufferArg.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Any.h" -#include "paddle/utils/ClassRegistrar.h" -#include "paddle/utils/Error.h" - -namespace paddle { - -/** - * Function Configuration. - * The argument type of Function::init. - */ -class FuncConfig { - public: - template - T get(const std::string& key, Error* err = nullptr) const { - try { - return any_cast(valueMap_.at(key)); - } catch (std::exception& e) { // could be cast or out of range exception. 
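// At this point the lookup has already failed, either because the key is
// absent (out_of_range from at()) or because it holds a different type
// (a failed any cast). The branch below reports through *err if the caller
// passed one; otherwise a missing key is treated as fatal.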
- if (err) {
- *err = Error(e.what());
- } else {
- LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
- }
- return T();
- }
- }
-
- template
- FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
- auto it = valueMap_.find(key);
- if (it != valueMap_.end()) { // already contains key.
- if (err) {
- *err = Error("Key %s is already set in FuncConfig", key.c_str());
- } else {
- LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
- }
- return *this;
- }
- valueMap_[key] = any(v);
- return *this;
- }
-
- protected:
- mutable std::unordered_map valueMap_;
-};
-
-/**
- * Argument type for Function::calc().
- * A BufferArgs contains a set of BufferArg,
- * because Function can have multiple inputs and outputs.
- *
- * addArg() with a Matrix object is used to adapt a Layer Argument.
- * It creates a BufferArg object inside addArg(),
- * which is freed in the destructor of BufferArgs.
- *
- * addArg() with a BufferArg object only saves the object's address;
- * the caller must guarantee that the BufferArg object remains valid
- * for the lifetime of the BufferArgs.
- */
-class BufferArgs {
- public:
- BufferArgs() {}
-
- ~BufferArgs() {
- for (auto arg : _args_) {
- delete arg;
- }
- }
-
- size_t size() const { return args_.size(); }
-
- // add argument into BufferArgs
- // Tensor can be Matrix, Vector, IVector.
- // For inputs, do not need argType.
- // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
- void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
- _args_.push_back(new BufferArg(arg, argType));
- addArg(*_args_.back());
- }
-
- void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
- _args_.push_back(new BufferArg(arg, argType));
- addArg(*_args_.back());
- }
-
- void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
- _args_.push_back(new BufferArg(arg, argType));
- addArg(*_args_.back());
- }
-
- // Add arg into BufferArgs and reshape the arg.
- //
- // For example, arg represents an image buffer,
- // but Matrix can only represent a two-dimensional Tensor,
- // so an extra argument is needed to describe the shape of the image buffer.
- void addArg(const Matrix& arg,
- const TensorShape& shape,
- ArgType argType = UNSPECIFIED);
-
- void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
- void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
-
- void addArg(const Matrix& matrix,
- const IVector& vector,
- ArgType argType = UNSPECIFIED);
-
- // get argument
- const BufferArg& operator[](size_t num) const {
- CHECK_LT(num, args_.size());
- return *args_[num];
- }
-
- void addArg(BufferArg& arg) { args_.push_back(&arg); }
-
- void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
-
- void addArg(SequenceArg& arg) { args_.push_back(&arg); }
-
- void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
-
- private:
- std::vector args_;
- // The BufferArg object is constructed and freed by BufferArgs.
- std::vector _args_;
-};
-
-/**
- * \brief Base class for Function.
- * The basic Function implementation requires overriding the init and calc
- * interfaces.
- *
- * The caller needs to ensure the validity of the arguments
- * during Function execution.
- *
- * Function inputs are read-only; Function outputs have two modes: ASSIGN_TO
- * and ADD_TO.
- * If output.getArgType() == ASSIGN_TO, this is assign mode, and the
- * calculation result of the Function is assigned to the output BufferArg.
- * If output.getArgType() == ADD_TO, this is add mode, and the calculation
- * result of the Function needs to be added to the output BufferArg.
- *
- * For example:
- * ASSIGN_TO: output = Function(inputs)
- * ADD_TO: output += Function(inputs)
- * If a Function has more than one output, each output can have a different
- * mode.
- */
-class FunctionBase {
- public:
- virtual ~FunctionBase() {}
-
- virtual void init(const FuncConfig& config) {}
-
- virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
-
- // This member function is used to check whether the BufferType and shape of
- // the inputs and outputs arguments of the Function are correct.
- // The general calc function calls this check to validate the arguments;
- // the caller can additionally check its own arguments before calc is called.
- virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {}
-
- // Calculate the number of floating-point operations of this Function.
- // The inputs and outputs arguments do not need to contain the actual data,
- // only the shape.
- // Some Functions have identical input and output shapes,
- // so the complete set of arguments may not be required;
- // passing the full arguments is always correct for this interface.
- virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) {
- return 0;
- }
-
- int getNumInputs() const { return numInputs_; }
-
- int getNumOutputs() const { return numOutputs_; }
-
- static ClassRegistrar funcRegistrar_;
-
- protected:
- // numInputs_ and numOutputs_ represent the maximum numbers of
- // inputs and outputs supported by the Function.
- // Some Functions accept fewer arguments, so for those the comparison is
- // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_
- size_t numInputs_;
- size_t numOutputs_;
-};
-
-#define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName
-
-#define REGISTER_TYPED_FUNC(typeName, deviceName, className) \
- static InitFunction __reg_type_##typeName##deviceName([]() { \
- FunctionBase::funcRegistrar_ \
- .registerClass>( \
- FUNC_NAME(typeName, deviceName)); \
- })
-
-} // namespace paddle
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
deleted file mode 100644
index f5e6ca3f515a7fcd1498979703a0a59ddca40742..0000000000000000000000000000000000000000
--- a/paddle/function/FunctionTest.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
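Taken together, FuncConfig, BufferArgs, and the registrar define the calling convention for every Function in this directory. A minimal sketch of that convention, for a hypothetical Function registered as "MyFunc-CPU" that takes one input, one output, and a single "scale" config key (the name and key are illustrative, not from the patch; inMat/outMat are assumed pre-sized Matrix objects):

// Create the Function by its registered "name-DEVICE" key, configure it,
// and invoke it with type-erased buffer arguments.
void runMyFunc(const Matrix& inMat, Matrix& outMat) {
  std::shared_ptr<FunctionBase> func(
      FunctionBase::funcRegistrar_.createByType("MyFunc-CPU"));
  func->init(FuncConfig().set("scale", (real)1.5));

  BufferArgs inputs;
  BufferArgs outputs;
  inputs.addArg(inMat);               // inputs are read-only
  outputs.addArg(outMat, ASSIGN_TO);  // assign mode: outMat = f(inMat)
  func->calc(inputs, outputs);
}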
*/ - -#include "Function.h" -#include -#include "paddle/math/SparseMatrix.h" - -namespace paddle { - -template -void FunctionApi(typename Tensor::Matrix& output, - const typename Tensor::Matrix& input); - -template <> -void FunctionApi(CpuMatrix& output, const CpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 100U); - EXPECT_EQ(output.getWidth(), 200U); -} - -template <> -void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 10U); - EXPECT_EQ(output.getWidth(), 20U); -} - -template -void Function(const BufferArgs& arguments) { - const auto input = arguments[0].matrix(); - auto output = arguments[1].matrix(); - FunctionApi(output, input); -} - -TEST(Function, BufferArgs) { - CpuMatrix cpuInput = CpuMatrix(100, 200); - CpuMatrix cpuOutput = CpuMatrix(100, 200); - BufferArgs cpuArgments; - cpuArgments.addArg(cpuInput); - cpuArgments.addArg(cpuOutput); - Function(cpuArgments); - - GpuMatrix gpuInput = GpuMatrix(10, 20); - GpuMatrix gpuOutput = GpuMatrix(10, 20); - BufferArgs gpuArgments; - gpuArgments.addArg(gpuInput); - gpuArgments.addArg(gpuOutput); - Function(gpuArgments); -} - -/** - * Some tests case are used to check the consistency between the BufferArg type - * argument received by Function and the original type argument. - * - * Use Case: - * TEST() { - * Matrix matrix(...); - * CheckBufferArg lambda = [=](const BufferArg& arg) { - * // check matrix and arg are equivalent - * EXPECT_EQ(matrix, arg); - * } - * - * BufferArgs argments{matrix...}; - * std::vector checkFunc{lambda...}; - * testBufferArgs(argments, checkFunc); - * } - */ -typedef std::function CheckBufferArg; - -void testBufferArgs(const BufferArgs& inputs, - const std::vector& check) { - EXPECT_EQ(inputs.size(), check.size()); - for (size_t i = 0; i < inputs.size(); i++) { - check[i](inputs[i]); - } -} - -void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) { - EXPECT_EQ(inputs.size(), 1U); - check(inputs[0]); -} - -TEST(Arguments, Matrix) { - MatrixPtr matrix = Matrix::create(100, 200); - CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 2U); - EXPECT_EQ(arg.shape()[0], 100U); - EXPECT_EQ(arg.shape()[1], 200U); - EXPECT_EQ(arg.data(), matrix->getData()); - - EXPECT_EQ(arg.matrix().getHeight(), matrix->getHeight()); - EXPECT_EQ(arg.matrix().getWidth(), matrix->getWidth()); - EXPECT_EQ(arg.matrix().getData(), matrix->getData()); - }; - - BufferArgs argments; - argments.addArg(*matrix); - std::vector checkFunc; - checkFunc.push_back(check); - testBufferArgs(argments, checkFunc); -} - -TEST(Arguments, Vector) { - VectorPtr vector = Vector::create(100, false); - CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 1U); - EXPECT_EQ(arg.shape()[0], 100U); - EXPECT_EQ(arg.data(), vector->getData()); - - CpuVector inVector = arg.vector(); - EXPECT_EQ(inVector.getSize(), vector->getSize()); - EXPECT_EQ(inVector.getData(), vector->getData()); - }; - - BufferArgs argments; - argments.addArg(*vector); - std::vector checkFunc; - checkFunc.push_back(check); - testBufferArgs(argments, checkFunc); -} - -TEST(Arguments, CpuSparseMatrix) { - CpuSparseMatrix sparse(200, 300, 50); - CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 2U); - EXPECT_EQ(arg.shape()[0], 200U); - EXPECT_EQ(arg.shape()[1], 300U); - EXPECT_EQ(arg.data(), sparse.getData()); - // CHECK_EQ(arg.sparse().nnz(), 50); - // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT); - // 
CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
- EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
- EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
- };
-
- BufferArgs argments;
- argments.addArg(sparse);
- std::vector checkFunc;
- checkFunc.push_back(check);
- testBufferArgs(argments, checkFunc);
-}
-
-TEST(Arguments, BufferArg) {
- BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
- CheckBufferArg check = [=](const BufferArg& arg) {
- EXPECT_EQ(arg.shape().ndims(), 3U);
- EXPECT_EQ(arg.shape()[0], 1U);
- EXPECT_EQ(arg.shape()[1], 2U);
- EXPECT_EQ(arg.shape()[2], 3U);
- };
-
- BufferArgs argments;
- argments.addArg(arg);
- testBufferArgs(argments, check);
-}
-
-} // namespace paddle
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
deleted file mode 100644
index 14003d2c885c8f846f9445ad8844869c9112816e..0000000000000000000000000000000000000000
--- a/paddle/function/FunctionTest.h
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/math/tests/TensorCheck.h"
-#include "paddle/testing/TestUtil.h"
-
-namespace paddle {
-
-typedef std::shared_ptr BufferArgPtr;
-
-namespace test {
-template
-struct Allocator;
-
-template <>
-struct Allocator {
- using type = CpuMemoryHandle;
-};
-
-template <>
-struct Allocator {
- using type = GpuMemoryHandle;
-};
-
-// Copy argument1 to argument2
-template
-class CopyArgument {
- public:
- void operator()(const BufferArg& arg1, BufferArg& arg2) {
- CHECK_EQ(arg1.valueType(), arg2.valueType());
- CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements());
-
- if (arg1.valueType() == VALUE_TYPE_INT32) {
- IVectorPtr vector1 =
- IVector::create((int*)arg1.data(),
- arg1.shape().getElements(),
- DType1 == DEVICE_TYPE_CPU ? false : true);
- IVectorPtr vector2 =
- IVector::create((int*)arg2.data(),
- arg2.shape().getElements(),
- DType2 == DEVICE_TYPE_CPU ? false : true);
- vector2->copyFrom(*vector1);
- } else {
- VectorPtr vector1 =
- Vector::create((real*)arg1.data(),
- arg1.shape().getElements(),
- DType1 == DEVICE_TYPE_CPU ? false : true);
- VectorPtr vector2 =
- Vector::create((real*)arg2.data(),
- arg2.shape().getElements(),
- DType2 == DEVICE_TYPE_CPU ? false : true);
- vector2->copyFrom(*vector1);
- }
- }
-};
-} // namespace test
-
-/**
- * \brief A class for comparing two Functions of different implementations.
- * For example, it can be used to check that the CPU and GPU
- * implementations of a function are consistent.
- *
- * Use case:
- * // Initializes a test object; the corresponding cpu and gpu Functions
- * // are constructed according to FunctionName and FuncConfig.
- * CpuGpuFuncCompare test(FunctionName, FuncConfig);
- * // Prepare input and output arguments.
- * // The inputs and outputs do not need to contain real data;
- * // only the argument types and shapes are required.
- * test.addInputs(input1);
- * test.addInputs(input2);
- * test.addOutputs(output1);
- * test.addOutputs(output2);
- * // Run.
- * // According to the type and shape of the arguments (inputs_/outputs_),
- * // run() automatically initializes the arguments required by the cpu and
- * // gpu functions (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_),
- * // calls both Functions, and compares the CPU and GPU results
- * // for consistency.
- * test.run();
- */
-template
-class Compare2Function {
- public:
- typedef typename test::Allocator::type Allocator1;
- typedef typename test::Allocator::type Allocator2;
- typedef typename Tensor::Vector Vector1;
- typedef typename Tensor::Vector Vector2;
- typedef typename Tensor::SparseMatrix SparseMatrix1;
- typedef typename Tensor::SparseMatrix SparseMatrix2;
-
- Compare2Function(const std::string& name1,
- const std::string& name2,
- const FuncConfig& config)
- : function1_(FunctionBase::funcRegistrar_.createByType(name1)),
- function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
- function1_->init(config);
- function2_->init(config);
- initArgsCallback_ = nullptr;
- }
-
- ~Compare2Function() {}
-
- // The input only needs to contain the shape; it does not contain data.
- void addInputs(const BufferArg& input) {
- size_t size =
- input.shape().getElements() * sizeOfValuType(input.valueType());
- func1Memory_.emplace_back(std::make_shared(size));
- func2Memory_.emplace_back(std::make_shared(size));
-
- func1Inputs_.emplace_back(std::make_shared(
- func1Memory_.back()->getBuf(), input.valueType(), input.shape()));
- func2Inputs_.emplace_back(std::make_shared(
- func2Memory_.back()->getBuf(), input.valueType(), input.shape()));
- }
-
- // assume one copy of sequence is shared by different SequenceArgs
- void addSequence(const SequenceIdArg& input) {
- CHECK_EQ(input.shape().ndims(), 1UL);
- size_t batchSize = input.shape()[0];
- size_t numSeqs = batchSize / 10 + 1;
- size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
- func1Memory_.emplace_back(std::make_shared(sizeId));
- func2Memory_.emplace_back(std::make_shared(sizeId));
- seq1_ = std::make_shared(func1Memory_.back()->getBuf(),
- TensorShape{numSeqs + 1});
- seq2_ = std::make_shared(func2Memory_.back()->getBuf(),
- TensorShape{numSeqs + 1});
- /// init sequence Id
- initArg(*seq1_, batchSize);
-
- copyArg_(*seq1_, *seq2_);
- }
-
- void addInputs(const SequenceArg& input) {
- CHECK_EQ(input.shape().ndims(), 2UL);
- size_t batchSize = input.shape()[0];
- if (!seq1_ || !seq2_) { // sequence not exist
- addSequence(SequenceIdArg(TensorShape{batchSize}));
- }
-
- size_t size =
- input.shape().getElements() * sizeOfValuType(input.valueType());
- func1Memory_.emplace_back(std::make_shared(size));
- func2Memory_.emplace_back(std::make_shared(size));
-
- /// SequenceArg
- func1Inputs_.emplace_back(
- std::make_shared(func1Memory_.back()->getBuf(),
- input.valueType(),
- input.shape(),
- *seq1_));
- func2Inputs_.emplace_back(
- std::make_shared(func2Memory_.back()->getBuf(),
- input.valueType(),
- input.shape(),
- *seq2_));
- }
-
- void registerInitCallback(std::function callback) {
- initArgsCallback_ = callback;
- }
-
- // The output only needs to contain the shape; it does not contain data.
- void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) { - size_t size = - output.shape().getElements() * sizeOfValuType(output.valueType()); - func1Memory_.emplace_back(std::make_shared(size)); - func2Memory_.emplace_back(std::make_shared(size)); - - func1Outputs_.emplace_back( - std::make_shared(func1Memory_.back()->getBuf(), - output.valueType(), - output.shape(), - argType)); - func2Outputs_.emplace_back( - std::make_shared(func2Memory_.back()->getBuf(), - output.valueType(), - output.shape(), - argType)); - } - - /// add and init output sparse matrix - void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) { - sparse1_ = std::make_shared( - output.shape()[0], - output.shape()[1], - output.nnz(), - static_cast(output.dataType()), - static_cast(output.dataFormat())); - - sparse2_ = std::make_shared( - output.shape()[0], - output.shape()[1], - output.nnz(), - static_cast(output.dataType()), - static_cast(output.dataFormat())); - - /// init sparse matrix - hl_stream_t stream(HPPL_STREAM_1); - sparse1_->randomizeUniform(); - sparse2_->copyFrom(*sparse1_, stream); - hl_stream_synchronize(stream); - - func1Outputs_.emplace_back( - std::make_shared(*sparse1_, argType)); - func2Outputs_.emplace_back( - std::make_shared(*sparse2_, argType)); - } - - void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) { - CHECK_EQ(output.shape().ndims(), 2UL); - size_t batchSize = output.shape()[0]; - - if (!seq1_ || !seq2_) { // sequence not exist - addSequence(SequenceIdArg(TensorShape{batchSize})); - } - size_t size = - output.shape().getElements() * sizeOfValuType(output.valueType()); - func1Memory_.emplace_back(std::make_shared(size)); - func2Memory_.emplace_back(std::make_shared(size)); - - /// SequenceArg - func1Outputs_.emplace_back( - std::make_shared(func1Memory_.back()->getBuf(), - output.valueType(), - output.shape(), - *seq1_, - argType)); - func2Outputs_.emplace_back( - std::make_shared(func2Memory_.back()->getBuf(), - output.valueType(), - output.shape(), - *seq2_, - argType)); - } - - void addInputs(const SparseMatrixArg& input) { - sparse1_ = std::make_shared( - input.shape()[0], - input.shape()[1], - input.nnz(), - static_cast(input.dataType()), - static_cast(input.dataFormat())); - - sparse2_ = std::make_shared( - input.shape()[0], - input.shape()[1], - input.nnz(), - static_cast(input.dataType()), - static_cast(input.dataFormat())); - - /// init sparse matrix - hl_stream_t stream(HPPL_STREAM_1); - sparse1_->randomizeUniform(); - sparse2_->copyFrom(*sparse1_, stream); - hl_stream_synchronize(stream); - - func1Inputs_.emplace_back(std::make_shared(*sparse1_)); - func2Inputs_.emplace_back(std::make_shared(*sparse2_)); - } - - void run() { - // prepare cpu/gpu arguments - initInputs(); - - initOutputs(); - // function calculate - auto callFunction = [](FunctionBase* function, - std::vector& inputs, - std::vector& outputs) { - BufferArgs inArgs; - BufferArgs outArgs; - for (auto arg : inputs) { - inArgs.addArg(*arg); - } - for (auto arg : outputs) { - outArgs.addArg(*arg); - } - function->calc(inArgs, outArgs); - }; - - callFunction(function1_.get(), func1Inputs_, func1Outputs_); - callFunction(function2_.get(), func2Inputs_, func2Outputs_); - - // check outputs - compareOutputs(); - } - - std::shared_ptr getFunction1() const { return function1_; } - - std::shared_ptr getFunction2() const { return function2_; } - - protected: - // only init cpu argument, gpu argument copy from cpu argument. 
- void initArg(BufferArg& arg) { - Vector1 vector(arg.shape().getElements(), (real*)arg.data()); - vector.uniform(0.001, 1); - } - - void initArg(SequenceArg& arg) { - /// init only matrix - Vector1 vector(arg.shape().getElements(), (real*)arg.data()); - vector.uniform(0.001, 1); - } - - void initArg(SequenceIdArg& arg, size_t batchSize) { - size_t numSeqs = arg.numSeqs(); - int* buf = reinterpret_cast(arg.data()); - int pos = 0; - size_t maxLen = 2 * batchSize / numSeqs; - for (int i = 0; i < (int)numSeqs; ++i) { - int len = 1 + uniformRandom(std::min( - maxLen, batchSize - pos - numSeqs + i)); - buf[i] = pos; - pos += len; - VLOG(1) << " len=" << len; - } - buf[numSeqs] = batchSize; - } - - void initInputs() { - for (size_t i = 0; i < func1Inputs_.size(); i++) { - if (func1Inputs_[i]->isSparseArg()) { - continue; /// sparse matrix already init - } - - if (func1Inputs_[i]->isSequenceArg()) { - initArg(dynamic_cast(*func1Inputs_[i])); - } else { - initArg(*func1Inputs_[i]); - } - - if (initArgsCallback_ != nullptr) { - initArgsCallback_(*func1Inputs_[i], i); - } - - copyArg_(*func1Inputs_[i], *func2Inputs_[i]); - } - } - - void initOutputs() { - for (size_t i = 0; i < func1Outputs_.size(); i++) { - if (func1Outputs_[i]->isSparseArg()) { - continue; /// sparse matrix already init - } - - if (func1Outputs_[i]->isSequenceArg()) { - initArg(dynamic_cast(*func1Outputs_[i])); - } else { - initArg(*func1Outputs_[i]); - } - - copyArg_(*func1Outputs_[i], *func2Outputs_[i]); - } - } - - void compareOutputs() { - for (size_t i = 0; i < func1Outputs_.size(); i++) { - // TODO, Need a BufferCheck used to compare the two buffers. - const auto cpu = func1Outputs_[i]; - const auto gpu = func2Outputs_[i]; - CHECK_EQ(cpu->numElements(), gpu->numElements()); - Vector1 cpuVector(cpu->numElements(), (real*)cpu->data()); - Vector2 gpuVector(gpu->numElements(), (real*)gpu->data()); - autotest::TensorCheckErr(cpuVector, gpuVector); - } - } - - protected: - std::shared_ptr function1_; - std::shared_ptr function2_; - std::vector> func1Memory_; - std::vector> func2Memory_; - std::vector func1Inputs_; - std::vector func1Outputs_; - std::vector func2Inputs_; - std::vector func2Outputs_; - std::shared_ptr sparse1_; - std::shared_ptr sparse2_; - std::shared_ptr seq1_; - std::shared_ptr seq2_; - test::CopyArgument copyArg_; - std::function initArgsCallback_; -}; - -class CpuGpuFuncCompare - : public Compare2Function { - public: - CpuGpuFuncCompare(const std::string& name, const FuncConfig& config) - : Compare2Function(name + "-CPU", name + "-GPU", config) {} - - ~CpuGpuFuncCompare() {} -}; - -} // namespace paddle diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp deleted file mode 100644 index 5b023e2c10e5040a28660d555efceb0e26b40d49..0000000000000000000000000000000000000000 --- a/paddle/function/GemmConvOp.cpp +++ /dev/null @@ -1,522 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
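The CpuGpuFuncCompare harness above reduces a CPU/GPU consistency test to a few lines. A hedged sketch of how a test for the convolution function below might look; the FuncConfig key names and vector types follow the conv functions' init() and are assumptions here, while the shape-only BufferArg(nullptr, ...) constructor is the one exercised in FunctionTest.cpp above:

TEST(GemmConv, CpuGpuConsistency) {
  // Builds "GemmConv-CPU" and "GemmConv-GPU" from the registrar and gives
  // both the same configuration.
  CpuGpuFuncCompare test("GemmConv",
                         FuncConfig()
                             .set("paddings", std::vector<size_t>{1, 1})
                             .set("strides", std::vector<size_t>{1, 1})
                             .set("groups", (size_t)1));
  // Shapes only; run() fills the buffers, calls both Functions, and
  // compares the outputs element-wise.
  test.addInputs(BufferArg(nullptr, VALUE_TYPE_FLOAT, {2, 8, 16, 16}));
  test.addInputs(BufferArg(nullptr, VALUE_TYPE_FLOAT, {8, 8, 3, 3}));
  test.addOutputs(BufferArg(nullptr, VALUE_TYPE_FLOAT, {2, 8, 16, 16}));
  test.run();
}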
*/ - -#include "ConvOp.h" -#include "GemmFunctor.h" -#include "Im2Col.h" -#include "paddle/math/MemoryHandle.h" - -namespace paddle { - -/* - * \brief Forward calculation of convolution. - */ -template -class GemmConvFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - // TODO(hedaoyuan): Need to define some index macros, - // to avoid useing 0 and 1. - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - - real beta; - if (outputs[0].getArgType() == ADD_TO) { - beta = 1.0; - } else { - beta = 0.0; - } - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - - real* inputData = inputs[0].data(); - real* filterData = inputs[1].data(); - real* outputData = outputs[0].data(); - bool needIm2col = isNeedIm2col(filter); - - TensorShape imShape = - TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - - TensorShape colShape; - real* colData = NULL; - - if (needIm2col) { - colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - colData = reinterpret_cast(memory_->getBuf()); - } - - Im2ColFunctor im2col; - size_t inputOffset = imShape.getElements(); - size_t outputOffset = - (outputChannels / groups_) * outputHeight * outputWidth; - size_t filterOffset = filter.getElements() / groups_; - - for (size_t i = 0; i < batchSize; i++) { - for (size_t g = 0; g < groups_; g++) { - if (needIm2col) { - im2col(inputData + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW(), - dilationH(), - dilationW()); - } else { - colData = inputData + g * inputOffset; - } - int M = outputChannels / groups_; - int N = outputHeight * outputWidth; - int K = inputChannels / groups_ * filterHeight * filterWidth; - BlasGemm::compute(false, - false, - M, - N, - K, - 1.0f, - filterData + g * filterOffset, - K, - colData, - N, - beta, - outputData + g * outputOffset, - N); - } - inputData += inputChannels * inputHeight * inputWidth; - outputData += outputChannels * outputHeight * outputWidth; - } - } -}; - -#ifdef PADDLE_MOBILE_INFERENCE - -/* - * \brief Forward calculation of convolution, optimized for mobile. 
- */ -template -class GemmConvMobileFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - // TODO(hedaoyuan): Need to define some index macros, - // to avoid useing 0 and 1. - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - - real beta; - if (outputs[0].getArgType() == ADD_TO) { - beta = 1.0; - } else { - beta = 0.0; - } - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - - real* inputData = inputs[0].data(); - real* filterData = inputs[1].data(); - real* outputData = outputs[0].data(); - real* colData = NULL; - bool needIm2col = isNeedIm2col(filter); - - TensorShape imShape = - TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape; - - // Max col matrix width 4096, Max col matrix size 4M. - size_t outputHeightSteps = - std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight); - size_t maxColWidth = outputHeightSteps * outputWidth; - size_t channelSteps = - std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth, - (size_t)1), - inputChannels / groups_); - size_t maxColHeight = channelSteps * filterHeight * filterWidth; - - if (needIm2col) { - colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - - resizeBuffer(maxColHeight * maxColWidth * sizeof(real)); - colData = reinterpret_cast(memory_->getBuf()); - } - - Im2ColMobileFunctor im2col; - size_t inputOffset = imShape.getElements(); - size_t outputOffset = - (outputChannels / groups_) * outputHeight * outputWidth; - size_t filterOffset = filter.getElements() / groups_; - - int nStride = outputHeight * outputWidth; - int kStride = inputChannels / groups_ * filterHeight * filterWidth; - for (size_t i = 0; i < batchSize; i++) { - filterData = inputs[1].data(); - for (size_t g = 0; g < groups_; g++) { - if (needIm2col) { - real beta_ = beta; - for (size_t ic = 0; ic < inputChannels / groups_; - ic += channelSteps) { - int channels = std::min(inputChannels / groups_ - ic, channelSteps); - for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) { - int height = std::min(outputHeight - oh, outputHeightSteps); - - int M = outputChannels / groups_; - int N = height * outputWidth; - int K = channels * filterHeight * filterWidth; - // im2col - im2col(inputData, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW(), - dilationH(), - dilationW(), - channels, - oh, - height, - N); - - // gemm - BlasGemm::compute( - false, - false, - M, - N, - K, - 1.0f, - filterData + ic * filterHeight * filterWidth, - kStride, - colData, - N, - beta_, - outputData + oh * outputWidth, - nStride); - } - beta_ = 1.0; 
- } - } else { - int M = outputChannels / groups_; - int N = outputHeight * outputWidth; - int K = inputChannels / groups_ * filterHeight * filterWidth; - BlasGemm::compute(false, - false, - M, - N, - K, - 1.0f, - filterData, - K, - inputData, - N, - beta, - outputData, - N); - } - inputData += inputOffset; - outputData += outputOffset; - filterData += filterOffset; - } - } - - memory_.reset(); - } -}; - -#endif - -/* - * \brief Backward input calculation of convolution. - */ -template -class GemmConvGradInputFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& output = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& input = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - // Since the implementation of Col2ImFunctor is ADD_TO, - // this function only supports ADD_TO mode. - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - const TensorShape& output = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& input = outputs[0].shape(); - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - - real* outputGrad = inputs[0].data(); - real* filterData = inputs[1].data(); - real* inputGrad = outputs[0].data(); - bool needIm2col = isNeedIm2col(filter); - - TensorShape imShape = - TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - - TensorShape colShape; - real* colData = NULL; - - if (needIm2col) { - colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - colData = reinterpret_cast(memory_->getBuf()); - } - - Col2ImFunctor col2im; - size_t inputOffset = imShape.getElements(); - size_t outputOffset = - (outputChannels / groups_) * outputHeight * outputWidth; - size_t filterOffset = filter.getElements() / groups_; - - for (size_t i = 0; i < batchSize; i++) { - for (size_t g = 0; g < groups_; g++) { - int K = outputChannels / groups_; - int N = outputHeight * outputWidth; - int M = inputChannels / groups_ * filterHeight * filterWidth; - real scale = 0.0f; - if (!needIm2col) { - colData = inputGrad + g * inputOffset; - scale = 1.0f; - } - BlasGemm::compute(true, - false, - M, - N, - K, - 1.0f, - filterData + g * filterOffset, - M, - outputGrad + g * outputOffset, - N, - scale, - colData, - N); - if (needIm2col) { - col2im(inputGrad + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW(), - dilationH(), - dilationW()); - } - } - inputGrad += inputChannels * inputHeight * inputWidth; - outputGrad += outputChannels * outputHeight * outputWidth; - } - } -}; - -/* - * \brief Backward filter calculation of convolution. 
- */ -template -class GemmConvGradFilterFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& output = inputs[0].shape(); - const TensorShape& input = inputs[1].shape(); - const TensorShape& filter = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - const TensorShape& output = inputs[0].shape(); - const TensorShape& input = inputs[1].shape(); - const TensorShape& filter = outputs[0].shape(); - - real beta; - if (outputs[0].getArgType() == ADD_TO) { - beta = 1.0; - } else { - beta = 0.0; - } - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - - real* outputGrad = inputs[0].data(); - real* inputData = inputs[1].data(); - real* filterGrad = outputs[0].data(); - bool needIm2col = isNeedIm2col(filter); - - TensorShape imShape = - TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - - TensorShape colShape; - real* colData = NULL; - - if (needIm2col) { - colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - colData = reinterpret_cast(memory_->getBuf()); - } - - Im2ColFunctor im2col; - size_t inputOffset = imShape.getElements(); - size_t outputOffset = - (outputChannels / groups_) * outputHeight * outputWidth; - size_t filterOffset = filter.getElements() / groups_; - for (size_t i = 0; i < batchSize; i++) { - for (size_t g = 0; g < groups_; g++) { - if (needIm2col) { - im2col(inputData + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW(), - dilationH(), - dilationW()); - } else { - colData = inputData + g * inputOffset; - } - int M = outputChannels / groups_; - int K = outputHeight * outputWidth; - int N = inputChannels / groups_ * filterHeight * filterWidth; - BlasGemm::compute(false, - true, - M, - N, - K, - 1.0f, - outputGrad + g * outputOffset, - K, - colData, - K, - i == 0 ? 
beta : 1.0f, - filterGrad + g * filterOffset, - N); - } - inputData += inputChannels * inputHeight * inputWidth; - outputGrad += outputChannels * outputHeight * outputWidth; - } - } -}; - -#ifdef PADDLE_MOBILE_INFERENCE -REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction); -#else -REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); -#endif -REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); -REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction); -REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction); -REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction); -#endif - -} // namespace paddle diff --git a/paddle/function/GemmFunctor.cpp b/paddle/function/GemmFunctor.cpp deleted file mode 100644 index 0b1fe1b67d8fd6caf86a08bc05e250b1936e9f85..0000000000000000000000000000000000000000 --- a/paddle/function/GemmFunctor.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "GemmFunctor.h" -#include "paddle/math/MathFunctions.h" - -namespace paddle { - -template -struct BlasGemm { - static void compute(const bool transA, - const bool transB, - const int M, - const int N, - const int K, - const T alpha, - const T* A, - const int lda, - const T* B, - const int ldb, - const T beta, - T* C, - const int ldc) { -#ifdef PADDLE_USE_EIGEN_FOR_BLAS - EigenBlasGemm::compute( - transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); -#else - gemm(transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -#endif - } -}; - -template -struct BlasGemm { - static void compute(const bool transA, - const bool transB, - const int M, - const int N, - const int K, - const T alpha, - const T* A, - const int lda, - const T* B, - const int ldb, - const T beta, - T* C, - const int ldc) { - hl_matrix_mul((T*)A, - transA == false ? HPPL_OP_N : HPPL_OP_T, - (T*)B, - transB == false ? HPPL_OP_N : HPPL_OP_T, - C, - M, - N, - K, - alpha, - beta, - lda, - ldb, - ldc); - } -}; - -template struct BlasGemm; -template struct BlasGemm; - -} // namespace paddle diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp deleted file mode 100644 index 967c5b91536608364b4181707b843799b1764c3f..0000000000000000000000000000000000000000 --- a/paddle/function/Im2ColTest.cpp +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
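A compact way to see the three per-group GEMM shapes used by the passes above (forward, input gradient, filter gradient) is to write the bookkeeping out once. The sizes below are hypothetical; the M/N/K assignments match the calc() bodies above:

#include <cstdio>

// Per-group GEMM shapes for the im2col-based convolution passes.
// The col matrix is (inC/g * fH * fW) x (outH * outW).
int main() {
  const int inC = 8, outC = 16, g = 2, fH = 3, fW = 3, outH = 32, outW = 32;
  // forward: filter(M x K) * col(K x N) -> output(M x N)
  printf("forward: M=%d N=%d K=%d\n", outC / g, outH * outW, inC / g * fH * fW);
  // input grad: filter^T(M x K) * outGrad(K x N) -> col(M x N), then col2im
  printf("inGrad:  M=%d N=%d K=%d\n", inC / g * fH * fW, outH * outW, outC / g);
  // filter grad: outGrad(M x K) * col^T(K x N) -> filterGrad(M x N)
  printf("fltGrad: M=%d N=%d K=%d\n", outC / g, inC / g * fH * fW, outH * outW);
  return 0;
}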
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Im2Col.h" -#include -#include "Function.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/tests/TensorCheck.h" - -namespace paddle { - -template -void TestIm2ColFunctor() { - for (size_t channels : {1, 5, 32}) { - for (size_t inputHeight : {5, 33, 100}) { - for (size_t inputWidth : {5, 32, 96}) { - for (size_t filterHeight : {1, 5}) { - for (size_t filterWidth : {3, 7}) { - for (size_t stride : {1, 2}) { - for (size_t padding : {0, 1}) { - for (size_t dilation : {1, 3}) { - size_t filterSizeH = (filterHeight - 1) * dilation + 1; - size_t filterSizeW = (filterWidth - 1) * dilation + 1; - if (inputHeight + 2 * padding < filterSizeH || - inputWidth + 2 * padding < filterSizeW) - break; - if (padding >= filterSizeH || padding >= filterSizeW) break; - size_t outputHeight = - (inputHeight - filterSizeH + 2 * padding) / stride + 1; - size_t outputWidth = - (inputWidth - filterSizeW + 2 * padding) / stride + 1; - - TensorShape imShape = - TensorShape({channels, inputHeight, inputWidth}); - TensorShape colShape1 = TensorShape({channels, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - TensorShape colShape2 = TensorShape({outputHeight, - outputWidth, - channels, - filterHeight, - filterWidth}); - - size_t height = channels * filterHeight * filterWidth; - size_t width = outputHeight * outputWidth; - VectorPtr input1 = - Vector::create(imShape.getElements(), false); - VectorPtr input2 = - Vector::create(imShape.getElements(), false); - MatrixPtr output1 = - Matrix::create(height, width, false, false); - MatrixPtr output2 = - Matrix::create(width, height, false, false); - input1->uniform(0.001, 1); - input2->copyFrom(*input1); - - Im2ColFunctor im2Col1; - Im2ColFunctor im2Col2; - im2Col1(input1->getData(), - imShape, - output1->getData(), - colShape1, - stride, - stride, - padding, - padding, - dilation, - dilation); - im2Col2(input2->getData(), - imShape, - output2->getData(), - colShape2, - stride, - stride, - padding, - padding, - dilation, - dilation); - - // The transposition of the result of ColFormat == kCFO - // is equal to the result of ColFormat == kOCF. 
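// In matrix terms: kCFO produces a [channels * filterHeight * filterWidth]
// x [outputHeight * outputWidth] column matrix, while kOCF produces the
// [outputHeight * outputWidth] x [channels * filterHeight * filterWidth]
// layout, so transposing one should match the other element-wise, which is
// what the check below verifies.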
- MatrixPtr test; - output2->transpose(test, true); - autotest::TensorCheckErr(*output1, *test); - - Col2ImFunctor col2Im1; - Col2ImFunctor col2Im2; - - col2Im1(input1->getData(), - imShape, - output1->getData(), - colShape1, - stride, - stride, - padding, - padding, - dilation, - dilation); - col2Im2(input2->getData(), - imShape, - output2->getData(), - colShape2, - stride, - stride, - padding, - padding, - dilation, - dilation); - autotest::TensorCheckErr(*input1, *input2); - } - } - } - } - } - } - } - } -} - -TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor(); } - -#ifdef PADDLE_WITH_CUDA - -TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor(); } - -#endif - -template -void TestIm2ColMobileFunctor() { - for (size_t channels : {32}) { - for (size_t inputHeight : {33, 100}) { - for (size_t inputWidth : {32, 96}) { - for (size_t filterHeight : {5}) { - for (size_t filterWidth : {7}) { - for (size_t stride : {2}) { - for (size_t padding : {1}) { - for (size_t dilation : {1, 3}) { - size_t filterSizeH = (filterHeight - 1) * dilation + 1; - size_t filterSizeW = (filterWidth - 1) * dilation + 1; - if (inputHeight + 2 * padding < filterSizeH || - inputWidth + 2 * padding < filterSizeW) - break; - if (padding >= filterSizeH || padding >= filterSizeW) break; - size_t outputHeight = - (inputHeight - filterSizeH + 2 * padding) / stride + 1; - size_t outputWidth = - (inputWidth - filterSizeW + 2 * padding) / stride + 1; - - TensorShape imShape = - TensorShape({channels, inputHeight, inputWidth}); - TensorShape colShape1 = TensorShape({channels, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - - size_t height = channels * filterHeight * filterWidth; - size_t width = outputHeight * outputWidth; - VectorPtr input1 = - Vector::create(imShape.getElements(), false); - VectorPtr input2 = - Vector::create(imShape.getElements(), false); - MatrixPtr output1 = - Matrix::create(height, width, false, false); - MatrixPtr output2 = - Matrix::create(height, width, false, false); - input1->uniform(0.001, 1); - input2->copyFrom(*input1); - - Im2ColFunctor im2Col1; - Im2ColMobileFunctor im2Col2; - im2Col1(input1->getData(), - imShape, - output1->getData(), - colShape1, - stride, - stride, - padding, - padding, - dilation, - dilation); - im2Col2(input2->getData(), - imShape, - output2->getData(), - colShape1, - stride, - stride, - padding, - padding, - dilation, - dilation, - channels, - 0, - outputHeight, - outputHeight * outputWidth); - - autotest::TensorCheckEqual(*output1, *output2); - } - } - } - } - } - } - } - } -} - -TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor(); } - -} // namespace paddle diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp deleted file mode 100644 index 7bf36c8050a8c33d836ce98dc7f3cf6d3de38d55..0000000000000000000000000000000000000000 --- a/paddle/function/MulOp.cpp +++ /dev/null @@ -1,347 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "MulOp.h" -#include "GemmFunctor.h" -#include "paddle/math/SIMDFunctions.h" -#include "paddle/utils/ThreadLocal.h" - -namespace { -inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { - for (unsigned int i = 0; i < len; ++i) { - a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i]; - } -} - -inline void colVecAddTo( - real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) { - for (unsigned int i = 0; i < len; ++i) { - a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c; - } -} -} // namespace - -namespace paddle { -/// sparse matrix (+)= dense matrix * dense matrix -template <> -void MulOp(CpuSparseMatrix& out, - const CpuMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - CHECK_EQ(out.getValueType(), FLOAT_VALUE); - if (scaleT == 0) { - out.zeroMem(); - } - const real* A = a.getData(); - const real* B = b.getData(); - real* C = out.getValue(); - int* rows = out.getRows(); - int* cols = out.getCols(); - size_t width = out.getWidth(); - size_t height = out.getHeight(); - - /// SPARSE_CSC, {a any, b not trans} - if (out.getFormat() == SPARSE_CSC) { - /// b not trans and a any - CHECK(!bTrans); - size_t m = !aTrans ? a.getWidth() : a.getHeight(); - for (size_t i = 0; i < width; i++) { - size_t start = out.getColStartIdx(i); - size_t end = out.getColStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t rowIdx = rows[j]; - for (size_t k = 0; k < m; k++) { - sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) * - B[k * width + i]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - return; - } - - /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans} - if (out.getFormat() == SPARSE_CSR) { - /// a and b can not both transpose - CHECK(!(aTrans && bTrans)); - size_t m = a.getWidth(); - for (size_t i = 0; i < height; i++) { - size_t start = out.getRowStartIdx(i); - size_t end = out.getRowStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t colIdx = cols[j]; - for (size_t k = 0; k < m; k++) { - sum += (!aTrans ? A[i * m + k] : A[k * height + i]) * - (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]); - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - return; - } -} - -/// dense matrix (+)= dense matrix * dense matrix -template <> -void MulOp(CpuMatrix& out, - const CpuMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - BlasGemm::compute( - aTrans, - bTrans, - out.getHeight(), - out.getWidth(), - !aTrans ? a.getWidth() : a.getHeight(), - scaleAB, - a.getData(), - a.getStride(), - b.getData(), - b.getStride(), - scaleT, - out.getData(), - out.getStride()); -} - -/// dense matrix (+)= sparse matrix * dense matrix -template <> -void MulOp(CpuMatrix& out, - const CpuSparseMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - if (scaleT == 0) { - out.zeroMem(); - } - const real* B = b.getData(); - real* C = out.getData(); - if (out.getWidth() % 32 == 0) { - CHECK_EQ((size_t)B % 32, 0UL); - CHECK_EQ((size_t)C % 32, 0UL); - } - - int* cols = a.getCols(); - real* values = a.getValue(); - for (size_t i = 0; i < a.getHeight(); ++i) { - const int start = a.getRowStartIdx(i); - const int end = a.getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]), - !aTrans ? const_cast(b).getRow(cols[j]) - : const_cast(b).getRow(i), - (a.getValueType() == FLOAT_VALUE) ? 
values[j] : (real)1.0, - out.getWidth()); - } - } -} - -/// dense matrix (+)= dense matrix * sparse matrix -template <> -void MulOp(CpuMatrix& out, - const CpuMatrix& a, - const CpuSparseMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - if (scaleT == 0) { - out.zeroMem(); - } - real* A = const_cast(a.getData()); - real* B = const_cast(b.getValue()); - real* C = out.getData(); - int* rows = b.getRows(); - int* cols = b.getCols(); - - /// SPARSE_CSC format - if (b.getFormat() == SPARSE_CSC) { - for (size_t j = 0; j < b.getWidth(); ++j) { - int start = b.getColStartIdx(j); - int end = b.getColStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo(!bTrans ? C + j : C + rows[i], - !bTrans ? A + rows[i] : A + j, - (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i], - out.getHeight(), - out.getWidth(), - a.getWidth()); - } - } - return; - } - - /// SPARSE_CSR format - if (b.getFormat() == SPARSE_CSR) { - for (size_t j = 0; j < b.getHeight(); ++j) { - int start = b.getRowStartIdx(j); - int end = b.getRowStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo(!bTrans ? C + cols[i] : C + j, - !bTrans ? A + j : A + cols[i], - (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i], - out.getHeight(), - out.getWidth(), - a.getWidth()); - } - } - return; - } -} - -/** - * mul operator - * out = scaleT * out + scaleAB * (A * B) - * here, scaleT in {0, 1}, scaleAB == 1, - * out = A * B, ASSIGN_TO - * out += A * B, ADD_TO - * - * - * \param outputs[0] output matrix (out), M * N, - * could be either Sparse or Dense Matrix - * M is num of rows, N is num of columns - * \param inputs[0] first input matrix (A), M * K (if non-trans) - * could be either Sparse or Dense Matrix - * M is num of rows, K is num of columns - * \param inputs[1] second input matrix (B), K * N (if non-trans) - * could be either Sparse or Dense Matrix - * K is num of rows, N is num of columns - * - * Support eight Mul operators, with both GPU and CPU devices - * For each device, four Mul operators are supported: - * 1. dense (out) = dense (A) * dense (B) - * 2. dense (out) = sparse (A) * dense (B) - * sparse matrix only support SPARSE_CSR format - * 3. dense (out) = dense (A) * sparse (B) - * sparse matrix support SPARSE_CSC and SPARSE_CSR formats - * 4. sparse (out) = dense (A) * dense (B) - * sparse matrix support SPARSE_CSC and SPARSE_CSR formats - * - */ -template -class MulFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - aTrans_ = config.get("aTrans"); - bTrans_ = config.get("bTrans"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK(!aTrans_ || !bTrans_) - << "Not support both a and b are transpose matrices"; - - CHECK_EQ((size_t)2, inputs.size()); - CHECK_EQ((size_t)1, outputs.size()); - CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data()); - CHECK_EQ(inputs[0].shape().ndims(), (size_t)2); - CHECK_EQ(inputs[1].shape().ndims(), (size_t)2); - CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); - - size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1]; - size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0]; - size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1]; - size_t bCol = !bTrans_ ? 
inputs[1].shape()[1] : inputs[1].shape()[0]; - /// C = A * B, or C += A * B, for matrix format - CHECK_EQ(aCol, bRow); - CHECK_EQ(aRow, outputs[0].shape()[0]); - CHECK_EQ(bCol, outputs[0].shape()[1]); - - /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO) - real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0; - - /// support dense = not both sparse * sparse - /// or sparse = dense * dense - CHECK((!outputs[0].isSparseArg() && - !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) || - (outputs[0].isSparseArg() && !inputs[0].isSparseArg() && - !inputs[1].isSparseArg())); - - auto outMat = outputs[0].matrix(); - /// dense matrix = dense matrix * dense matrix - if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && - !outputs[0].isSparseArg()) { - MulOp(outMat, - inputs[0].matrix(), - inputs[1].matrix(), - 1.0, // scaleAB - scaleT, - aTrans_, - bTrans_); - return; - } - - /// dense matrix = dense matrix * sparse matrix - if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() && - !outputs[0].isSparseArg()) { - CHECK(!aTrans_) << "Not supported a transpose"; - MulOp(outMat, - inputs[0].matrix(), - inputs[1].sparse().SparseMatrix(), - 1.0, // scaleAB - scaleT, - aTrans_, - bTrans_); - return; - } - - /// dense matrix = sparse matrix * dense matrix - if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() && - !outputs[0].isSparseArg()) { - CHECK(!bTrans_) << "Not supported b transpose"; - CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR) - << "Only supported SPARSE_CSR format for sparse matrix a"; - MulOp(outMat, - inputs[0].sparse().SparseMatrix(), - inputs[1].matrix(), - 1.0, // scaleAB - scaleT, - aTrans_, - bTrans_); - return; - } - - /// sparse matrix = dense matrix * dense matrix - auto outSparseMat = outputs[0].sparse().SparseMatrix(); - if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && - outputs[0].isSparseArg()) { - MulOp(outSparseMat, - inputs[0].matrix(), - inputs[1].matrix(), - 1.0, // scaleAB - scaleT, - aTrans_, - bTrans_); - return; - } - } - - private: - bool aTrans_; - bool bTrans_; -}; - -REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc); -#endif -} // namespace paddle diff --git a/paddle/function/MulOp.h b/paddle/function/MulOp.h deleted file mode 100644 index e6057be4e54b3cc2b3502b9a93825d4b53037c91..0000000000000000000000000000000000000000 --- a/paddle/function/MulOp.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
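
The deleted MulFunc above documents the contract out = scaleT * out + scaleAB * (A * B), with scaleT in {0, 1} (ASSIGN_TO vs. ADD_TO) and scaleAB == 1. A naive dense reference of that contract, standalone and illustrative only (the real CPU path goes through BlasGemm):

    #include <cstddef>

    // Reference semantics of MulOp (dense case), illustrative only:
    //   out = scaleT * out + scaleAB * op(A) * op(B)
    // scaleT == 0 is ASSIGN_TO, scaleT == 1 is ADD_TO. op(A) is M x K and
    // op(B) is K x N after applying the optional transposes.
    void mulRef(float* out, const float* A, const float* B,
                size_t M, size_t N, size_t K,
                float scaleAB, float scaleT, bool aTrans, bool bTrans) {
      for (size_t i = 0; i < M; ++i)
        for (size_t j = 0; j < N; ++j) {
          float sum = 0;
          for (size_t k = 0; k < K; ++k) {
            float a = aTrans ? A[k * M + i] : A[i * K + k];
            float b = bTrans ? B[j * K + k] : B[k * N + j];
            sum += a * b;
          }
          out[i * N + j] = scaleT * out[i * N + j] + scaleAB * sum;
        }
    }
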
*/ - -#pragma once - -#include "Function.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" - -namespace paddle { -/// CPU, dense matrix (+)= dense matrix * dense matrix -template -void MulOp(CpuMatrix& out, - const CpuMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// CPU, dense matrix (+)= sparse matrix * dense matrix -template -void MulOp(CpuMatrix& out, - const CpuSparseMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// CPU, dense matrix (+)= dense matrix * sparse matrix -template -void MulOp(CpuMatrix& out, - const CpuMatrix& a, - const CpuSparseMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// CPU, sparse matrix (+)= dense matrix * dense matrix -template -void MulOp(CpuSparseMatrix& out, - const CpuMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// GPU, dense matrix (+)= dense matrix * dense matrix -template -void MulOp(GpuMatrix& out, - const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// GPU, dense matrix (+)= sparse matrix * dense matrix -template -void MulOp(GpuMatrix& out, - const GpuSparseMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// GPU, dense matrix (+)= dense matrix * sparse matrix -template -void MulOp(GpuMatrix& out, - const GpuMatrix& a, - const GpuSparseMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// GPU, sparse matrix (+)= dense matrix * dense matrix -template -void MulOp(GpuSparseMatrix& out, - const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -} // namespace paddle diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu deleted file mode 100644 index d63416a8e45346089bac23100742b8afc99b8e77..0000000000000000000000000000000000000000 --- a/paddle/function/MulOpGpu.cu +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MulOp.h" -#include "hl_base.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" - -namespace paddle { -/// dense matrix (+)= dense matrix * dense matrix -template <> -void MulOp(GpuMatrix& out, - const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; - hl_matrix_mul(const_cast(a.getData()), - !aTrans ? HPPL_OP_N : HPPL_OP_T, - const_cast(b.getData()), - !bTrans ? HPPL_OP_N : HPPL_OP_T, - const_cast(out.getData()), - out.getHeight(), - out.getWidth(), - !aTrans ? 
a.getWidth() : a.getHeight(), - scaleAB, - scaleT, - a.getStride(), - b.getStride(), - out.getStride()); -} - -/// dense matrix (+)= sparse matrix * dense matrix -template <> -void MulOp(GpuMatrix& out, - const GpuSparseMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - CHECK(out.isContiguous()); - CHECK(b.isContiguous()); - CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; - hl_matrix_csr_mul_dense(a.sMatrix_.get(), - aTrans ? HPPL_OP_T : HPPL_OP_N, - const_cast(b.getData()), - HPPL_OP_N, - const_cast(out.getData()), - out.getHeight(), - out.getWidth(), - b.getHeight(), - scaleAB, - scaleT); -} - -/// dense matrix (+)= dense matrix * sparse matrix -template <> -void MulOp(GpuMatrix& out, - const GpuMatrix& a, - const GpuSparseMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - CHECK(out.isContiguous()); - CHECK(a.isContiguous()); - CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; - - if (b.format_ == SPARSE_CSC) { - hl_matrix_dense_mul_csc(const_cast(a.getData()), - HPPL_OP_N, - b.sMatrix_.get(), - bTrans ? HPPL_OP_T : HPPL_OP_N, - const_cast(out.getData()), - out.getHeight(), - out.getWidth(), - a.getWidth(), - scaleAB, - scaleT); - } else { - hl_matrix_dense_mul_csr(const_cast(a.getData()), - HPPL_OP_N, - b.sMatrix_.get(), - bTrans ? HPPL_OP_T : HPPL_OP_N, - const_cast(out.getData()), - out.getHeight(), - out.getWidth(), - a.getWidth(), - scaleAB, - scaleT); - } -} - -/// sparse matrix (+)= dense matrix * dense matrix -template <> -void MulOp(GpuSparseMatrix& out, - const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; - hl_sparse_matrix_mul(const_cast(a.getData()), - aTrans ? HPPL_OP_T : HPPL_OP_N, - const_cast(b.getData()), - bTrans ? HPPL_OP_T : HPPL_OP_N, - out.sMatrix_.get(), - out.getHeight(), - out.getWidth(), - !bTrans ? b.getHeight() : b.getWidth(), - scaleAB, - scaleT); -} - -} // namespace paddle diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp deleted file mode 100644 index 4e1ebd749c0cd083c025e43a321d6992a11786ff..0000000000000000000000000000000000000000 --- a/paddle/function/MulOpTest.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "FunctionTest.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/math/tests/test_matrixUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT - -/** - * C += A * B, A, B, C dense matrix - * dense = dense * dense - */ -void testFuncDDDMatrix( - bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) { - real scaleT = 1.0; - size_t heightA = (transa == false) ? dimM : dimK; - size_t widthA = (transa == false) ? dimK : dimM; - size_t heightB = (transb == false) ? dimK : dimN; - size_t widthB = (transb == false) ? 
dimN : dimK; - size_t heightC = dimM; - size_t widthC = dimN; - // init Test object - CpuGpuFuncCompare test( - "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb)); - // prepare input arguments - /// matrix A : HA * WA - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA})); - /// matrix B: HB * WB - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB})); - - /// output matrix C: HC * WC - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}), - scaleT == 1.0 ? ADD_TO : ASSIGN_TO); - // run Function - test.run(); -} - -TEST(MulOp, DDDMatrixMul) { - LOG(INFO) << "function test for dense = dense * dense matrix"; - for (const auto transa : {false, true}) { - for (const auto transb : {false, true}) { - for (const auto dimM : {1, 10, 100}) { - for (const auto dimN : {1, 10}) { - for (const auto dimK : {8}) { - if (transa && transb) { - continue; - } - VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') - << " transa=" << transa << " transb=" << transb - << " dimM=" << std::setw(5) << dimM - << " dimN=" << std::setw(5) << dimN - << " dimK=" << std::setw(5) << dimK; - testFuncDDDMatrix(transa, transb, dimM, dimN, dimK); - } - } - } - } - } -} - -/** - * C += A * B, B, C dense, A sparse - * dense = sparse * dense - */ -void testFuncDSparseDMatrix( - size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { - real scaleT = 1.0; - // init Test object - CpuGpuFuncCompare test( - "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); - // prepare input arguments - /// sparse matrix A : M * K - test.addInputs(SparseMatrixArg( - VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE)); - /// matrix B: K * N - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN})); - - /// output matrix C: M * N - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), - scaleT == 1.0 ? ADD_TO : ASSIGN_TO); - // run Function - test.run(); -} - -TEST(MuLOp, DSparseDMul) { - LOG(INFO) << "function test for dense = sparse * dense matrix"; - for (const auto dimM : {10, 100, 1000}) { - for (const auto dimN : {10, 100}) { - for (const auto dimK : {3, 10}) { - for (const auto nnz : {3, 10}) { - for (const auto FORMAT : {SPARSE_CSR}) { - VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') - << " dimM=" << std::setw(5) << dimM - << " dimN=" << std::setw(5) << dimN - << " dimK=" << std::setw(5) << dimK - << " nnz=" << std::setw(5) << nnz - << " format=" << std::setw(5) << FORMAT; - testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT); - } - } - } - } - } -} - -/** - * C += A * B, A, C dense, B sparse - * dense = dense * sparse - */ -void testFuncDDSparseMatrix( - size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { - real scaleT = 1.0; - // init Test object - CpuGpuFuncCompare test( - "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); - // prepare input arguments - /// matrix A : M * K - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); - - /// matrix B: K * N - test.addInputs(SparseMatrixArg( - VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE)); - - /// output matrix C: M * N - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), - scaleT == 1.0 ? 
ADD_TO : ASSIGN_TO); - // run Function - test.run(); -} - -TEST(MulOp, DDSparseMul) { - LOG(INFO) << "function test for dense = dense * sparse matrix"; - for (const auto dimM : {10, 100, 1000}) { - for (const auto dimN : {10, 100}) { - for (const auto dimK : {3, 10}) { - for (const auto nnz : {3, 10}) { - for (const auto FORMAT : {SPARSE_CSR, SPARSE_CSC}) { - VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') - << " dimM=" << std::setw(5) << dimM - << " dimN=" << std::setw(5) << dimN - << " dimK=" << std::setw(5) << dimK - << " nnz=" << std::setw(5) << nnz - << " format=" << std::setw(5) << FORMAT; - testFuncDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT); - } - } - } - } - } -} - -/** - * C += A * B, A sparse, B, C dense - * sparse = dense * dense - */ -void testFuncSparseDDMatrix( - size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { - real scaleT = 1.0; - // init Test object - CpuGpuFuncCompare test( - "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); - // prepare input arguments - /// matrix A : M * K - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); - - /// matrix B: K * N - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN})); - - /// output sparse matrix C: M * N - test.addOutputs( - SparseMatrixArg( - VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE), - scaleT == 1.0 ? ADD_TO : ASSIGN_TO); - // run Function - test.run(); -} - -TEST(MulOp, SparseDDMul) { - LOG(INFO) << "function test for sparse = dense * dense matrix"; - for (const auto dimM : {10, 100, 1000}) { - for (const auto dimN : {10, 100}) { - for (const auto dimK : {3, 10}) { - for (const auto nnz : {3, 10}) { - for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) { - VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') - << " dimM=" << std::setw(5) << dimM - << " dimN=" << std::setw(5) << dimN - << " dimK=" << std::setw(5) << dimK - << " nnz=" << std::setw(5) << nnz - << " format=" << std::setw(5) << FORMAT; - testFuncSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT); - } - } - } - } - } -} diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp deleted file mode 100644 index 5d7515e8c053439b95fb18de3c8ffe70705600a3..0000000000000000000000000000000000000000 --- a/paddle/function/PadOp.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "PadOp.h" -#include "paddle/math/Vector.h" - -namespace paddle { - -template <> -void Pad(real* outputs, - const real* inputs, - const int num, - const int inC, - const int inH, - const int inW, - const PadConf& pad) { - int cstart = pad.channel[0], cend = pad.channel[1]; - int hstart = pad.height[0], hend = pad.height[1]; - int wstart = pad.width[0], wend = pad.width[1]; - int outC = inC + cstart + cend; - int outH = inH + hstart + hend; - int outW = inW + wstart + wend; - for (int i = 0; i < num; i++) { - for (int c = 0; c < inC; c++) { - for (int h = 0; h < inH; h++) { - int inoff = ((i * inC + c) * inH + h) * inW; - int outoff = - ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart; - memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real)); - } - } - } -} - -template <> -void PadGrad(real* inGrad, - const real* outGrad, - const int num, - const int inC, - const int inH, - const int inW, - const PadConf& pad) { - int cstart = pad.channel[0], cend = pad.channel[1]; - int hstart = pad.height[0], hend = pad.height[1]; - int wstart = pad.width[0], wend = pad.width[1]; - int outC = inC + cstart + cend; - int outH = inH + hstart + hend; - int outW = inW + wstart + wend; - for (int i = 0; i < num; i++) { - for (int c = 0; c < inC; c++) { - for (int h = 0; h < inH; h++) { - int inoff = ((i * inC + c) * inH + h) * inW; - int outoff = - ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart; - CpuVector inG = CpuVector(inW, inGrad + inoff); - CpuVector outG = CpuVector(inW, const_cast(outGrad + outoff)); - inG += outG; - } - } - } -} - -static inline PadConf castToPadConf(const FuncConfig& conf) { - return {conf.get>("channel"), - conf.get>("height"), - conf.get>("width")}; -} - -/** - * \brief Padding zeros to input according to the specify dimension. - * The struct pad_ contains the padding size in each dimension. - * The input and output is a 4D tensor. In PadFunc, we only - * pad zeros to the 2nd to 4th dimension. - * - * Argument in this Function: - * \param pad_ A struct object contains the padding size in each dimension. - * It has six integers. The channelStart and channelEnd indicate - * how many zeros to add before and after the input in channel - * dimension. And the heightStart and heightEnd indicate padding - * in height dimension. The widthStart and widthEnd indicate the - * padding in width dimension. - * \param inputs A 4D tensor, only one input. - * \param outputs A 4D tensor, the output value after padding. - * - * For example, - * Input(2,2,2,3) = [ - * [ [[1,2,3], [3,4,5]], - * [[2,3,5], [1,6,7]] ], - * [ [[4,3,1], [1,8,7]], - * [[3,8,9], [2,3,5]] ] - * ] # the shape is (1,2,2,3) - * - * pad_: if channelStart = channelEnd = 1, others are 0. - * Output(2,4,2,3) = [ - * [ [[0,0,0], [0,0,0]], - * [[1,2,3], [3,4,5]], - * [[2,3,5], [1,6,7]], - * [[0,0,0], [0,0,0]] ], - * [ [[0,0,0], [0,0,0]], - * [[4,3,1], [1,8,7]], - * [[3,8,9], [2,3,5]], - * [[0,0,0], [0,0,0]] ] - * ] # the shape is (2,4,2,3) - * - * pad_: if widthStart = 1, widthEnd = 2, others are 0. - * Output(2,2,2,6) = [ - * [ [[0,1,2,3,0,0], [0,3,4,5,0,0]], - * [[0,2,3,5,0,0], [0,1,6,7,0,0]] ], - * [ [[0,4,3,1,0,0], [0,1,8,7,0,0]], - * [[0,3,8,9,0,0], [0,2,3,5,0,0]] ], - * ] # the shape is (2,2,2,6) - * - * pad_: if heightStart = 1, heightEnd = 1, others are 0. 
- * Output(2,2,4,3) = [ - * [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]], - * [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ], - * [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]], - * [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ], - * ] # the shape is (2,2,4,3) - */ - -template -class PadFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { pad_ = castToPadConf(config); } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - - size_t num = inputs[0].shape()[0]; - size_t inC = inputs[0].shape()[1]; - size_t inH = inputs[0].shape()[2]; - size_t inW = inputs[0].shape()[3]; - typename Tensor::Vector vec(outputs[0].shape().getElements(), - outputs[0].data()); - vec.zero(); - - Pad(outputs[0].data(), - inputs[0].data(), - num, - inC, - inH, - inW, - pad_); - } - - private: - PadConf pad_; -}; - -/** - * \brief The backward propagation of padding Function. Remove the elements - * in the padding positions of forward. - * - * Argument in this Function: - * \param pad_ The same meaning as it in PadFunc. - * \param inputs The gradient with respect to the output value of PadFunc. - * \param outputs The gradient with respect to the input value of PadFunc. - */ - -template -class PadGradFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { pad_ = castToPadConf(config); } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - - size_t num = outputs[0].shape()[0]; - size_t inC = outputs[0].shape()[1]; - size_t inH = outputs[0].shape()[2]; - size_t inW = outputs[0].shape()[3]; - - if (outputs[0].getArgType() != ADD_TO) { - // for unit test - typename Tensor::Vector tmp( - outputs[0].shape().getElements(), outputs[0].data()); - tmp.zero(); - } - - PadGrad(outputs[0].data(), - inputs[0].data(), - num, - inC, - inH, - inW, - pad_); - } - - private: - PadConf pad_; -}; - -REGISTER_TYPED_FUNC(Pad, CPU, PadFunc); -REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(Pad, GPU, PadFunc); -REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp deleted file mode 100644 index 129e9334582fad011c259e8ab8268b00a7fab7b6..0000000000000000000000000000000000000000 --- a/paddle/function/RowConvOp.cpp +++ /dev/null @@ -1,225 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
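
The Pad kernel deleted above reduces to one memcpy per input row into a zero-initialized output at the (cstart, hstart, wstart) offset. A standalone sketch of that forward pass (illustrative signature; the output buffer is assumed pre-zeroed, as PadFunc does with vec.zero()):

    #include <cstddef>
    #include <cstring>

    // Zero-pad an NCHW tensor: copy each input row of width inW into the
    // padded output at offset (cstart, hstart, wstart), as in Pad above.
    void padSketch(float* out, const float* in, int num, int inC, int inH,
                   int inW, int cstart, int cend, int hstart, int hend,
                   int wstart, int wend) {
      int outC = inC + cstart + cend;
      int outH = inH + hstart + hend;
      int outW = inW + wstart + wend;
      for (int i = 0; i < num; i++)
        for (int c = 0; c < inC; c++)
          for (int h = 0; h < inH; h++) {
            int inoff = ((i * inC + c) * inH + h) * inW;
            int outoff =
                ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
            std::memcpy(out + outoff, in + inoff, inW * sizeof(float));
          }
    }
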
*/ - -#include "RowConvOp.h" -#include -#include "paddle/math/Vector.h" - -namespace paddle { - -template <> -void RowConv(CpuMatrix& out, - const CpuMatrix& in, - const CpuMatrix& filter, - const CpuIVector& seq) { - const int* starts = seq.getData(); - const size_t numSeq = seq.getSize() - 1; - const size_t contextLength = filter.getHeight(); - for (size_t i = 0; i < numSeq; ++i) { - size_t begin = starts[i]; - size_t end = starts[i + 1]; - for (size_t j = begin; j < end; ++j) { - MatrixPtr x; - MatrixPtr w; - if ((j + contextLength) < end) { - x = (const_cast(in)).subMatrix(j, contextLength); - w = (const_cast(filter)).subMatrix(0, contextLength); - } else { - x = (const_cast(in)).subMatrix(j, end - j); - w = (const_cast(filter)).subMatrix(0, end - j); - } - MatrixPtr y = out.subMatrix(j, 1); - y->addDotMulVMM(*x, *w); - } - } -} - -template <> -void RowConvGrad(const CpuMatrix& outG, - const CpuMatrix& in, - const CpuMatrix& filter, - CpuMatrix& inG, - CpuMatrix& filterG, - const CpuIVector& seq) { - // gradient w.r.t filter - const int* starts = seq.getData(); - const size_t numSeq = seq.getSize() - 1; - const size_t contextLength = filter.getHeight(); - if (filterG) { - for (size_t i = 0; i < numSeq; ++i) { - size_t begin = starts[i]; - size_t end = starts[i + 1]; - size_t steps = end - begin; - for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) { - MatrixPtr x = - (const_cast(in)).subMatrix(begin + j, steps - j); - MatrixPtr dy = - (const_cast(outG)).subMatrix(begin, steps - j); - MatrixPtr dw = filterG.subMatrix(j, 1); - dw->addDotMulVMM(*dy, *x); - } - } - } - - // gradient w.r.t input feature - if (inG) { - for (size_t i = 0; i < numSeq; ++i) { - size_t begin = starts[i]; - size_t end = starts[i + 1]; - size_t steps = end - begin; - for (size_t j = 0; j < steps; ++j) { - MatrixPtr dx = inG.subMatrix(begin + j, 1); - for (size_t t = 0; t < contextLength; ++t) { - if (int(j - t) >= 0) { - MatrixPtr dy = - (const_cast(outG)).subMatrix(begin + j - t, 1); - MatrixPtr w = (const_cast(filter)).subMatrix(t, 1); - dx->addDotMul(*dy, *w, 1.0, 1.0); - } - } - } - } - } -} - -/** - * \brief The row convolution is called lookahead convolution. It is firstly - * introduced in deep-speech2 system. The bidirectional RNN that learns - * representation for a sequence by performing a forward and a backward pass - * through the entire sequence. However, unlike unidirectional RNNs, - * bidirectional RNNs are challenging to deploy in an online and low-latency - * setting. The lookahead convolution incorporates information from future - * subsequences in a computationally efficient manner to improve unidirectional - * recurrent neural networks. - * - * The connection of row convolution is different form the 1D sequence - * convolution. Assumed that, the future context-length is k, that is to say, - * it can get the output at timestep t by using the the input feature from t-th - * timestep to (t+k)-th timestep. Assumed that the hidden dim of input - * activations are d, the activations r_t for the new layer at time-step t are: - * - * - * -- k + 1 - * r(t,i) = > W(i,j) * h(t+j-1, i), for (1 <= i <= d) - * -- j = 1 - * - * - * The weight shape is: (k + 1) x d - * Function Arguments: - * - * \param inputs[0] The input activations. - * \param inputs[0] The filter (or weight) and shape is (k+1) x d. - * \param outputs[1] The output activations. - * - * [1] Dario Amodei, etc. Deep Speech 2 : End-to-End Speech Recognition in - * English - * and Mandarin. 
https://arxiv.org/abs/1512.02595 - */ - -template -class RowConvFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override {} - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - // check - CHECK_EQ(2UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - // TODO(qingqing): support ASSIGN_TO. - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) - << "SequenceArg required here."; - const auto in = dynamic_cast(inputs[0]); - auto out = dynamic_cast(outputs[0]); - auto w = inputs[1]; - CHECK(in.data() && out.data() && in.getSequenceId().data()); - CHECK_EQ(in.shape().ndims(), 2UL); - CHECK(in.shape() == out.shape()); - CHECK_EQ(w.shape()[1], in.shape()[1]); - - auto outMat = out.matrix(); - const auto inMat = in.matrix(); - const auto wMat = w.matrix(); - const auto seqId = in.getSequenceId().vector(); - - RowConv(outMat, inMat, wMat, seqId); - } -}; - -/** - * \brief The backward of row convolution function. This function calculated - * the gradient w.r.t filter and the gradient w.r.t input activations(or data). - * - * Argument in this Function: - * - * \param inputs[0] The gradient w.r.t output activations. - * \param inputs[1] The input activations. - * \param inputs[2] The filter (or weight) and shape is (k+1) x d. - * \param outputs[0] The gradient w.r.t input activations. - * \param outputs[1] The gradient w.r.r filter. - * - * Abbreviation: - * w.r.t: with respect to. - */ - -template -class RowConvGradFunc : public FunctionBase { - // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc - public: - void init(const FuncConfig& config) override {} - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - // check - CHECK_EQ(3UL, inputs.size()); - CHECK_EQ(2UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - CHECK_EQ(outputs[1].getArgType(), ADD_TO); - CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() && - outputs[0].isSequenceArg()) - << "SequenceArg required here."; - - const auto outGrad = dynamic_cast(inputs[0]); - const auto in = dynamic_cast(inputs[1]); - const auto w = inputs[2]; - auto inGrad = dynamic_cast(outputs[0]); - auto wGrad = outputs[1]; - - CHECK_EQ(in.shape().ndims(), 2UL); - CHECK(in.shape() == inGrad.shape()); - CHECK(in.shape() == outGrad.shape()); - CHECK_EQ(wGrad.shape()[1], in.shape()[1]); - - const auto outGMat = outGrad.matrix(); - const auto inMat = in.matrix(); - const auto wMat = w.matrix(); - auto inGMat = inGrad.data() - ? inGrad.matrix() - : typename Tensor::Matrix(nullptr, 0, 0); - auto wGMat = wGrad.data() - ? wGrad.matrix() - : typename Tensor::Matrix(nullptr, 0, 0); - const auto seqId = in.getSequenceId().vector(); - - RowConvGrad(outGMat, inMat, wMat, inGMat, wGMat, seqId); - } -}; - -REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc); -REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc); -REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu deleted file mode 100644 index f820ee9a9713ce17547aa03945dc3c291ef50a59..0000000000000000000000000000000000000000 --- a/paddle/function/RowConvOpGpu.cu +++ /dev/null @@ -1,373 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
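
Spelled out, the formula in the comment above is r(t, i) = sum over j = 1..k+1 of W(j, i) * h(t+j-1, i), with the window truncated at the sequence end. A standalone reference for a single sequence; names are illustrative, and the accumulation matches the ADD_TO semantics the Function requires:

    #include <algorithm>
    #include <cstddef>

    // Lookahead (row) convolution for one sequence of T steps and dim d:
    //   r(t, i) = sum_{j=0}^{ctx-1} W(j, i) * h(t + j, i),
    // truncated at the sequence end; ctx = k + 1 is filter.getHeight() above.
    void rowConvRef(float* out, const float* in, const float* w,
                    size_t T, size_t d, size_t ctx) {
      for (size_t t = 0; t < T; ++t) {
        size_t span = std::min(ctx, T - t);  // shrink the window near the end
        for (size_t i = 0; i < d; ++i) {
          float sum = 0;
          for (size_t j = 0; j < span; ++j)
            sum += w[j * d + i] * in[(t + j) * d + i];
          out[t * d + i] += sum;  // ADD_TO semantics
        }
      }
    }
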
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/cuda/include/hl_base.h" -#include "paddle/function/RowConvOp.h" - -namespace paddle { - -template -__global__ void KeRowConv(real* y, - const real* x, - const real* w, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int blky = blockDim.y; - const int gidx = blockIdx.x * blockDim.x; - - __shared__ real sw[BLOCK_H][BLOCK_W]; - - for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0; - } - - __syncthreads(); - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - for (int j = tidy; j < steps; j += blky) { - real sum = 0; - int off = (start + j) * width; - for (int t = 0; t < context; ++t) { - if ((start + j + t) < end) { - int xoff = off + t * width; - real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0; - sum += sw[t][tidx] * xVal; - } - } - if (gidx + tidx < width) { - y[off + gidx + tidx] += sum; - } - } - } -} - -__global__ void KeRowConv2(real* y, - const real* x, - const real* w, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int blky = blockDim.y; - const int gidx = blockIdx.x * blockDim.x; - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - for (int j = tidy; j < steps; j += blky) { - int off = (start + j) * width; - real sum = 0; - for (int t = 0; t < context && (start + j + t) < end; ++t) { - int xoff = off + t * width; - real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0; - real wd = gidx + tidx < width ? 
w[t * width + gidx + tidx] : 0.0; - sum += wd * xd; - } - if (gidx + tidx < width) { - y[off + gidx + tidx] += sum; - } - } - } -} - -template <> -void RowConv(GpuMatrix& out, // NOLINT - const GpuMatrix& in, - const GpuMatrix& filter, - const GpuIVector& seq) { - const size_t numSeq = seq.getSize() - 1; - const size_t contextLength = filter.getHeight(); - const size_t height = in.getHeight(); - const size_t width = in.getWidth(); - - real* y = out.getData(); - const real* x = in.getData(); - const real* w = filter.getData(); - const int* starts = seq.getData(); - - dim3 dimBlock(32, 32); - dim3 dimGrid(DIVUP(width, dimBlock.x), 1); - - if (contextLength <= 32) { - KeRowConv<32, 32><<>>( - y, x, w, starts, height, width, numSeq, contextLength); - } else { - KeRowConv2<<>>( - y, x, w, starts, height, width, numSeq, contextLength); - } - CHECK_SYNC("RowConv"); -} - -template -__global__ void KeRowConvBwWeight(real* dw, - const real* x, - const real* dy, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int blky = blockDim.y; - const int gidx = blockIdx.x * blockDim.x; - - __shared__ real sh_x[BLOCK_W][BLOCK_H]; - __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1]; - __shared__ real sh_dw[CONTEXT][BLOCK_W]; - - if (tidy < context) { - sh_dw[tidy][tidx] = 0.0; - } - __syncthreads(); - - // NOTE(zcd): temporary solution - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; - for (int j = tidy; j < size; j += BLOCK_H) { - int xoff = gidx + tidx; - int yoff = start + j; - - // transpose - sh_x[tidx][tidy] = - (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; - sh_dy[tidx][tidy + context - 1] = - (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; - __syncthreads(); - if (tidy < (context - 1)) { - yoff = yoff - context + 1; - sh_dy[tidx][tidy] = - (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; - } - __syncthreads(); - - for (int t = 0; t < context; t++) { - real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t]; - __syncthreads(); - // warp size and blockDim.x is 32. - - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down_sync(mask, val, offset); - - __syncthreads(); - if (tidx == 0) { - sh_dw[t][tidy] += val; - } - __syncthreads(); - } - } - } - - for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) { - dw[t * width + gidx + tidx] += sh_dw[t][tidx]; - } -} - -template -__global__ void KeRowConvBwWeight2(real* dw, - const real* x, - const real* dy, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int gidx = blockIdx.x * blockDim.x; - - __shared__ real sh_x[BLOCK_H][BLOCK_W]; - __shared__ real sh_dy[BLOCK_H][BLOCK_W]; - - // NOTE(zcd): temporary solution - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - - const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; - for (int j = tidy; j < size; j += BLOCK_H) { - int xoff = gidx + tidx; - int yoff = start + j; - - // transpose - sh_x[tidx][tidy] = - (xoff < width && yoff < end) ? 
x[yoff * width + xoff] : 0.0; - __syncthreads(); - - for (int t = 0; t < context; t++) { - sh_dy[tidx][tidy] = - (xoff < width && (yoff - t) >= start && yoff - t < end) - ? dy[(yoff - t) * width + xoff] - : 0.0; - __syncthreads(); - - real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; - __syncthreads(); - // warp size and blockDim.x is 32. - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down_sync(mask, val, offset); - - __syncthreads(); - - if (tidx == 0 && (gidx + tidy) < width) { - dw[t * width + gidx + tidy] += val; - } - } - } - } -} - -template -__global__ void KeRowConvBwData(real* dx, - const real* w, - const real* dy, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int blky = blockDim.y; - const int gidx = blockIdx.x * blockDim.x; - - __shared__ real sw[BLOCK_H][BLOCK_W]; - - for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0; - } - - __syncthreads(); - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - for (int j = tidy; j < steps; j += blky) { - real sum = 0; - int off = (start + j) * width; - for (int t = 0; t < context && (j - t) >= 0; ++t) { - int dyOff = off - t * width; - real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0; - sum += sw[t][tidx] * dyVal; - } - if (gidx + tidx < width) { - dx[off + gidx + tidx] += sum; - } - } - } -} - -__global__ void KeRowConvBwData2(real* dx, - const real* w, - const real* dy, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int blky = blockDim.y; - const int gidx = blockIdx.x * blockDim.x; - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - for (int j = tidy; j < steps; j += blky) { - real sum = 0; - int off = (start + j) * width; - for (int t = 0; t < context && (j - t) >= 0; ++t) { - int dyOff = off - t * width; - real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0; - real wVal = gidx + tidx < width ? 
w[t * width + gidx + tidx] : 0.0; - sum += wVal * dyVal; - } - if (gidx + tidx < width) { - dx[off + gidx + tidx] += sum; - } - } - } -} - -template <> -void RowConvGrad(const GpuMatrix& outG, - const GpuMatrix& in, - const GpuMatrix& filter, - GpuMatrix& inG, // NOLINT - GpuMatrix& filterG, // NOLINT - const GpuIVector& seq) { - const size_t numSeq = seq.getSize() - 1; - const size_t contextLength = filter.getHeight(); - const size_t height = in.getHeight(); - const size_t width = in.getWidth(); - - const real* dy = outG.getData(); - const real* x = in.getData(); - const real* w = filter.getData(); - const int* starts = seq.getData(); - - if (filterG) { - dim3 dimBlock(32, 32); - dim3 dimGrid(DIVUP(width, dimBlock.x), 1); - real* dw = filterG.getData(); - if (contextLength <= 32) { - KeRowConvBwWeight<32, 32, 32><<>>( - dw, x, dy, starts, height, width, numSeq, contextLength); - } else { - KeRowConvBwWeight2<32, 32><<>>( - dw, x, dy, starts, height, width, numSeq, contextLength); - } - } - - if (inG) { - real* dx = inG.getData(); - dim3 dimBlock2(32, 32); - dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1); - if (contextLength <= 64) { - KeRowConvBwData<32, 64><<>>( - dx, w, dy, starts, height, width, numSeq, contextLength); - } else { - KeRowConvBwData2<<>>( - dx, w, dy, starts, height, width, numSeq, contextLength); - } - } - - CHECK_SYNC("RowConvGrad"); -} - -} // namespace paddle diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp deleted file mode 100644 index 9a06ef2a96f25b5b7326049df2a708637f319561..0000000000000000000000000000000000000000 --- a/paddle/function/ScaleSubRegionOp.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
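
The backward-weight kernels above reduce each thread's partial sum across a 32-lane warp with __shfl_down_sync. Isolated, the pattern looks like this; a sketch assuming a full warp participates, whereas the deleted code builds its mask with CREATE_SHFL_MASK:

    #include <cuda_runtime.h>

    // Warp-level sum, the reduction pattern used in KeRowConvBwWeight above.
    // After the loop, lane 0 holds the sum of val across all 32 lanes.
    __device__ float warpSum(float val) {
      const unsigned mask = 0xffffffffu;  // assume all 32 lanes are active
      for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down_sync(mask, val, offset);
      return val;
    }
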
*/ - -#include "ScaleSubRegionOp.h" -#include "paddle/function/TensorShape.h" - -namespace paddle { - -template <> -void ScaleSubRegion(real* outputs, - const real* inputs, - const real* indices, - const TensorShape shape, - const FuncConfig& conf) { - real value = conf.get("value"); - - int number = shape[0]; - int channel = shape[1]; - int height = shape[2]; - int width = shape[3]; - - memcpy(outputs, inputs, number * channel * height * width * sizeof(real)); - - for (int n = 0; n < number; ++n) { - // indices start from 1 - int offset = n * 6; - for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) { - for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) { - for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) { - int idx = ((n * channel + c) * height + h) * width + w; - outputs[idx] *= value; - } - } - } - } -} - -template <> -void ScaleSubRegionGrad(const real* inGrad, - real* outGrad, - const real* indices, - const TensorShape shape, - const FuncConfig& conf) { - real value = conf.get("value"); - - int number = shape[0]; - int channel = shape[1]; - int height = shape[2]; - int width = shape[3]; - - for (int n = 0; n < number; ++n) { - for (int c = 0; c < channel; ++c) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - int idx = ((n * channel + c) * height + h) * width + w; - int offset = n * 6; - if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) && - h >= (indices[offset + 2] - 1) && - h <= (indices[offset + 3] - 1) && - w >= (indices[offset + 4] - 1) && - w <= (indices[offset + 5] - 1)) { - outGrad[idx] += inGrad[idx] * value; - } else { - outGrad[idx] += inGrad[idx]; - } - } - } - } - } -} - -/** - * \brief For each instance, ScaleSubRegion can be used to multiply a value to - * a specified sub continuous region. By providing start index and end - * index for C/H/W, you can specify the location and shape of the region. - * - * Argument in this Function: - * \param inputs A 4-D tensor with shape [N, C, H, W], only one input. - * \param indices A 2-D tensor with shape [N, 6], indicates the sub region. - * \param outputs A 4-D tensor with same shape as inputs, output value. - */ -template -class ScaleSubRegionFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { conf_ = config; } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(2UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - - TensorShape shape = inputs[0].shape(); - - ScaleSubRegion(outputs[0].data(), - inputs[0].data(), - inputs[1].data(), - shape, - conf_); - } - - private: - FuncConfig conf_; -}; - -/** - * \brief The backward propagation of ScaleSubRegion Function. - * - * Argument in this Function: - * \param inputs A 4-D tensor with shape [N, C, H, W], output gradient. - * \param indices A 2-D tensor with shape [N, 6], indicates the sub region. - * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value. 
- */ - -template -class ScaleSubRegionGradFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { conf_ = config; } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(2UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - - TensorShape shape = inputs[0].shape(); - - ScaleSubRegionGrad(inputs[0].data(), - outputs[0].data(), - inputs[1].data(), - shape, - conf_); - } - - private: - FuncConfig conf_; -}; - -REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc); -REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc); -REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp deleted file mode 100644 index 750fb6bf28baf050b1f9f965a1a9b315363e5645..0000000000000000000000000000000000000000 --- a/paddle/function/SwitchOp.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "SwitchOp.h" -#include "paddle/math/Vector.h" - -namespace paddle { - -template <> -void NCHW2NHWC(real* outputs, - const real* inputs, - const int num, - const int inC, - const int inH, - const int inW, - const int argType) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < inC; ++c) { - for (int h = 0; h < inH; ++h) { - for (int w = 0; w < inW; ++w) { - if (argType == ADD_TO) { - outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++); - } else { - outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++); - } - } - } - } - } -} - -template <> -void NHWC2NCHW(real* outputs, - const real* inputs, - const int num, - const int inH, - const int inW, - const int inC, - const int argType) { - for (int n = 0; n < num; ++n) { - for (int h = 0; h < inH; ++h) { - for (int w = 0; w < inW; ++w) { - for (int c = 0; c < inC; ++c) { - if (argType == ADD_TO) { - outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++); - } else { - outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++); - } - } - } - } - } -} - -/** - * \brief Switch dimension order of image input. - * The input and output is a 4D tensor. Switch order - * 'batch_size,channels, height, width' to - * order 'batch_size, height, width, channels'. - * - * Argument in this Function: - * \param inputs input data with order 'batch_size,channels, height, width'. - * \param outputs output data with order 'batch_size, height, width, channels'. 
- */ -template -class NCHW2NHWCFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override {} - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - - size_t num = inputs[0].shape()[0]; - size_t inC = inputs[0].shape()[1]; - size_t inH = inputs[0].shape()[2]; - size_t inW = inputs[0].shape()[3]; - NCHW2NHWC(outputs[0].data(), - inputs[0].data(), - num, - inC, - inH, - inW, - outputs[0].getArgType()); - } -}; - -/** - * \brief Switch dimension order of image input. - * The input and output is a 4D tensor. Switch order - * 'batch_size, height, width, channels' to - * order 'batch_size, channels, height, width'. - * - * Argument in this Function: - * \param inputs input data with order 'batch_size, height, width, channels'. - * \param outputs output data with order 'batch_size, channels, height, width'. - */ -template -class NHWC2NCHWFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override {} - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - - size_t num = inputs[0].shape()[0]; - size_t inH = inputs[0].shape()[1]; - size_t inW = inputs[0].shape()[2]; - size_t inC = inputs[0].shape()[3]; - - NHWC2NCHW(outputs[0].data(), - inputs[0].data(), - num, - inH, - inW, - inC, - outputs[0].getArgType()); - } -}; - -REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc); -REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc); -REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h deleted file mode 100644 index b384591bd8852bbdc61bf9aa678ce613732c369a..0000000000000000000000000000000000000000 --- a/paddle/function/TensorType.h +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
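
The SwitchOp kernels above are pure index remappings. For the ASSIGN_TO case, NCHW to NHWC is simply the following (standalone sketch, illustrative name):

    #include <cstddef>

    // NCHW -> NHWC: out[n][h][w][c] = in[n][c][h][w] (the ASSIGN_TO case above).
    void nchw2nhwc(float* out, const float* in, size_t N, size_t C,
                   size_t H, size_t W) {
      for (size_t n = 0; n < N; ++n)
        for (size_t c = 0; c < C; ++c)
          for (size_t h = 0; h < H; ++h)
            for (size_t w = 0; w < W; ++w)
              out[((n * H + h) * W + w) * C + c] =
                  in[((n * C + c) * H + h) * W + w];
    }
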
*/ - -#pragma once - -#include "paddle/math/Matrix.h" - -namespace paddle { - -enum ValueType { - VALUE_TYPE_INT32 = 0, - VALUE_TYPE_FLOAT = 1, - VALUE_TYPE_DOUBLE = 2, - VALUE_TYPE_BYTE = 3 -}; - -enum DeviceType { - DEVICE_TYPE_UNSPECIFIED = 0, - DEVICE_TYPE_CPU = 1, - DEVICE_TYPE_GPU = 2 -}; - -enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 }; - -enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 }; - -inline int sizeOfValuType(ValueType valueType) { - if (valueType == VALUE_TYPE_INT32) { - return 4; - } else if (valueType == VALUE_TYPE_FLOAT) { - return 4; - } else if (valueType == VALUE_TYPE_DOUBLE) { - return 8; - } else { - LOG(FATAL) << "Unknown type: " << valueType; - return 0; - } -} - -template -struct DataType; - -template <> -struct DataType { - static const ValueType value = VALUE_TYPE_FLOAT; -}; - -template <> -struct DataType { - static const ValueType value = VALUE_TYPE_DOUBLE; -}; - -template <> -struct DataType { - static const ValueType value = VALUE_TYPE_INT32; -}; - -namespace detail { - -template -struct MatrixT; - -template <> -struct MatrixT { - using type = CpuMatrix; -}; - -template <> -struct MatrixT { - using type = GpuMatrix; -}; - -template <> -struct MatrixT { - using type = void; // Not implemented -}; - -template <> -struct MatrixT { - using type = void; // Not implemented -}; - -template -struct SparseMatrixT; - -template <> -struct SparseMatrixT { - using type = CpuSparseMatrix; -}; - -template <> -struct SparseMatrixT { - using type = GpuSparseMatrix; -}; - -template <> -struct SparseMatrixT { - using type = void; // Not implemented -}; - -template <> -struct SparseMatrixT { - using type = void; // Not implemented -}; - -template -struct VectorT; - -template <> -struct VectorT { - using type = CpuVector; -}; - -template <> -struct VectorT { - using type = GpuVector; -}; - -template <> -struct VectorT { - using type = CpuIVector; -}; - -template <> -struct VectorT { - using type = GpuIVector; -}; - -} // namespace detail - -template -struct Tensor { - typedef typename detail::VectorT::type Vector; - typedef typename detail::MatrixT::type Matrix; - typedef typename detail::SparseMatrixT::type SparseMatrix; -}; - -} // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp deleted file mode 100644 index d7ac83da41aaba5cd38b042d0381dea527f9c42d..0000000000000000000000000000000000000000 --- a/paddle/function/neon/NeonDepthwiseConv.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
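
TensorType.h, deleted above, maps a (ValueType, DeviceType) pair to a concrete matrix/vector class at compile time. A reduced illustration of that trait pattern, with toy stand-in types rather than Paddle's actual classes:

    // Reduced illustration of the trait pattern in the deleted TensorType.h:
    // select a concrete type from (ValueType, DeviceType) at compile time.
    enum ValueType { VALUE_TYPE_FLOAT, VALUE_TYPE_DOUBLE };
    enum DeviceType { DEVICE_TYPE_CPU, DEVICE_TYPE_GPU };

    struct CpuMatrix {};  // toy stand-ins for the real Paddle classes
    struct GpuMatrix {};

    template <ValueType VT, DeviceType DT>
    struct MatrixT;  // primary template left undefined

    template <ValueType VT>
    struct MatrixT<VT, DEVICE_TYPE_CPU> { using type = CpuMatrix; };

    template <ValueType VT>
    struct MatrixT<VT, DEVICE_TYPE_GPU> { using type = GpuMatrix; };

    // Usage: MatrixT<VALUE_TYPE_FLOAT, DEVICE_TYPE_CPU>::type is CpuMatrix.
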
*/ - -#include "NeonDepthwiseConv.h" -#include "paddle/function/ConvOp.h" - -namespace paddle { - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -template -class NeonDepthwiseConvFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - - int batchSize = input[0]; - int inputChannels = input[1]; - int inputHeight = input[2]; - int inputWidth = input[3]; - int filterHeight = getFilterHeight(filter); - int filterWidth = getFilterWidth(filter); - int outputChannels = output[1]; - int outputHeight = output[2]; - int outputWidth = output[3]; - int filterMultiplier = outputChannels / groups_; - CHECK_EQ(static_cast(inputChannels), groups_); - - // only support strideH() == strideW() and filterHeight == filterWidth. - CHECK_EQ(strideH(), strideW()); - CHECK_EQ(filterHeight, filterWidth); - - float* inputData = inputs[0].data(); - float* filterData = inputs[1].data(); - float* outputData = outputs[0].data(); - - // padding the input - float* inputPadding = inputData; - int padInputHeight = inputHeight + 2 * paddingH(); - int padInputWidth = inputWidth + 2 * paddingW(); - int newSize = - batchSize * (inputChannels + 1) * padInputHeight * padInputWidth; - - resizeBuffer(newSize); - inputPadding = reinterpret_cast(memory_->getBuf()); - neon::Padding::run(inputData, - inputPadding, - batchSize * inputChannels, - inputHeight, - inputWidth, - padInputHeight, - padInputWidth); - - std::function - DepthWiseConv; - - if (filterWidth == 3 && strideW() == 1) { - DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; - } else if (filterWidth == 3 && strideW() == 2) { - DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run; - } else if (filterWidth == 4 && strideW() == 1) { - DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; - } else if (filterWidth == 4 && strideW() == 2) { - DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run; - } else { - LOG(FATAL) << "Not supported"; - } - - for (int i = 0; i < batchSize; i++) { - DepthWiseConv(inputPadding, - filterData, - padInputHeight, - padInputWidth, - outputChannels, - outputHeight, - outputWidth, - filterMultiplier, - outputData); - inputPadding += inputChannels * padInputHeight * padInputWidth; - outputData += outputChannels * outputHeight * outputWidth; - } - } -}; - -#ifndef PADDLE_TYPE_DOUBLE -REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction); -#endif - -#endif - -} // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp deleted file mode 100644 index 1fc5daf6078bbd5b4506ff2e0832e2cc3ec48fe3..0000000000000000000000000000000000000000 --- a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "NeonDepthwiseConv.h" -#include "paddle/function/ConvOp.h" - -namespace paddle { - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -template -class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - - int batchSize = input[0]; - int inputChannels = input[1]; - int inputHeight = input[2]; - int inputWidth = input[3]; - int filterHeight = getFilterHeight(filter); - int filterWidth = getFilterWidth(filter); - int outputChannels = output[1]; - int outputHeight = output[2]; - int outputWidth = output[3]; - int filterMultiplier = outputChannels / groups_; - CHECK_EQ(inputChannels, groups_); - - // only support strideH() == strideW() and filterHeight == filterWidth. 
- CHECK_EQ(strideH(), strideW()); - CHECK_EQ(paddingH(), paddingW()); - CHECK_EQ(filterHeight, filterWidth); - - float* inputData = inputs[0].data(); - float* filterData = inputs[1].data(); - float* outputData = outputs[0].data(); - - // padding the input, input -> inputPadding - float* inputPadding = inputData; - int padInputHeight = - (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH(); - int padInputWidth = - (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW(); - - if (padInputHeight > inputHeight || padInputWidth > inputWidth) { - int newSize = batchSize * inputChannels * padInputHeight * padInputWidth; - resizeBuffer(newSize); - inputPadding = reinterpret_cast(memory_->getBuf()); - if (strideH() == 1) { - neon::Padding::run(inputData, - inputPadding, - batchSize * inputChannels, - inputHeight, - inputWidth, - padInputHeight, - padInputWidth); - } else if (strideH() == 2) { - neon::StridePadding::run(inputData, - inputPadding, - batchSize * inputChannels, - inputHeight, - inputWidth, - padInputHeight, - padInputWidth); - } else { - LOG(FATAL) << "Not supported"; - } - } - - std::function - DepthWiseConv; - - if (filterWidth == 3) { - DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; - } else if (filterWidth == 4) { - DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; - } else { - LOG(FATAL) << "Not supported"; - } - - for (int i = 0; i < batchSize; i++) { - DepthWiseConv(inputPadding, - filterData, - padInputHeight, - padInputWidth, - outputChannels, - outputHeight, - outputWidth, - filterMultiplier, - outputData); - inputPadding += inputChannels * padInputHeight * padInputWidth; - outputData += outputChannels * outputHeight * outputWidth; - } - } -}; - -#ifndef PADDLE_TYPE_DOUBLE - -REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose, - CPU, - NeonDepthwiseConvTransposeFunction); - -#endif - -#endif - -} // namespace paddle diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp deleted file mode 100644 index 48c997b50d8c73b25c58801c30e597c9d1f3232a..0000000000000000000000000000000000000000 --- a/paddle/function/nnpack/NNPACKConvOp.cpp +++ /dev/null @@ -1,247 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
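Both NEON files reduce their problem to a stride-1 valid convolution over a re-padded buffer; the transpose variant sizes that buffer as `(in - 1) * stride + 2 * filter - 1 - 2 * pad` (interleaving zeros via StridePadding when the stride is 2). A quick check that this padding arithmetic reproduces the standard transposed-convolution output size `(in - 1) * stride + filter - 2 * pad`:

```cpp
#include <cassert>

int main() {
  // Size ranges matching the kernels the deleted file supports.
  for (int in = 2; in <= 8; ++in) {
    for (int stride = 1; stride <= 2; ++stride) {
      for (int k = 3; k <= 4; ++k) {
        for (int pad = 0; pad < k / 2; ++pad) {
          int padInput = (in - 1) * stride + 2 * k - 1 - 2 * pad;
          // A stride-1 valid convolution over the padded buffer...
          int out = padInput - k + 1;
          // ...matches the standard transposed-convolution output size.
          assert(out == (in - 1) * stride + k - 2 * pad);
        }
      }
    }
  }
  return 0;
}
```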
*/ - -#include "nnpack.h" -#include "paddle/function/ConvOp.h" - -DEFINE_bool(nnpack_allocate_outside, - true, - "Allocate and free workspace memory outside the NNPACK interface."); -DEFINE_int32(nnpack_num_threads, - 0, - "The number of nnpack threads" - "default: 0; 0 to disable threadpool."); - -namespace paddle { - -nnp_convolution_algorithm get_nnp_convolution_algorithm( - const std::string& algorithm) { - if (algorithm == "auto") { - return nnp_convolution_algorithm_auto; - } else if (algorithm == "ft8x8") { - return nnp_convolution_algorithm_ft8x8; - } else if (algorithm == "ft16x16") { - return nnp_convolution_algorithm_ft16x16; - } else if (algorithm == "wt8x8") { - return nnp_convolution_algorithm_wt8x8; - } else if (algorithm == "implicit-gemm") { - return nnp_convolution_algorithm_implicit_gemm; - } else if (algorithm == "direct") { - return nnp_convolution_algorithm_direct; - } else { - return nnp_convolution_algorithm_auto; - } -} - -template -class NNPACKConvFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - algorithm_ = get_nnp_convolution_algorithm(config.get("algo")); - transform_strategy_ = nnp_convolution_transform_strategy_compute; - nnp_status status = nnp_initialize(); - CHECK_EQ(status, nnp_status_success); - workspaceBuffer_ = nullptr; - workspaceSize_ = 0; - - create_nnpack_threadpool(); - } - - ~NNPACKConvFunction() { - if (workspaceBuffer_) { - free(workspaceBuffer_); - } - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - check(inputs, outputs); - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - - nnp_size inputSize = {.width = inputWidth, .height = inputHeight}; - nnp_padding padding = {.top = (size_t)paddingH(), - .right = (size_t)paddingW(), - .bottom = (size_t)paddingH(), - .left = (size_t)paddingW()}; - nnp_size kernelSize = {.width = filterWidth, .height = filterHeight}; - nnp_size outputSubsampling = {.width = (size_t)strideW(), - .height = (size_t)strideH()}; - - float* inputData = inputs[0].data(); - float* filterData = inputs[1].data(); - float* outputData = outputs[0].data(); - - void* bufferPtr = nullptr; - size_t* sizePtr = nullptr; - size_t needSize; - if (FLAGS_nnpack_allocate_outside) { - if (batchSize == 1) { - nnp_status status = nnp_convolution_inference(algorithm_, - transform_strategy_, - inputChannels, - outputChannels, - inputSize, - padding, - kernelSize, - outputSubsampling, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - &needSize, - nnp_activation_identity, - nullptr, - nullptr, - nullptr); - CHECK_EQ(status, nnp_status_success); - } else { - // only supports stride = 1 - CHECK_EQ(strideH(), 1); - 
CHECK_EQ(strideW(), 1); - nnp_status status = nnp_convolution_output(algorithm_, - batchSize, - inputChannels, - outputChannels, - inputSize, - padding, - kernelSize, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - &needSize, - nnp_activation_identity, - nullptr, - nullptr, - nullptr); - CHECK_EQ(status, nnp_status_success); - } - - VLOG(3) << "workspace size is " << needSize; - if (needSize > workspaceSize_) { - workspaceSize_ = needSize; - if (workspaceBuffer_) { - free(workspaceBuffer_); - } else { - posix_memalign(&workspaceBuffer_, 64, needSize); - } - } - - if (needSize) { - bufferPtr = workspaceBuffer_; - sizePtr = &needSize; - } - } - - size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth; - size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth; - size_t filterOffset = filter.getElements() / groups_; - - if (batchSize == 1) { - for (size_t g = 0; g < groups_; g++) { - nnp_status status = - nnp_convolution_inference(algorithm_, - transform_strategy_, - inputChannels / groups_, - outputChannels / groups_, - inputSize, - padding, - kernelSize, - outputSubsampling, - inputData + inputOffset * g, - filterData + filterOffset * g, - nullptr, /* bias */ - outputData + outputOffset * g, - bufferPtr, - sizePtr, - nnp_activation_identity, - nullptr, - threadpool_, /* threadpool */ - nullptr); - CHECK_EQ(status, nnp_status_success); - } - } else { - // only supports stride = 1 - CHECK_EQ(strideH(), 1); - CHECK_EQ(strideW(), 1); - - // TODO(hedaoyuan): There has some bug when batchSize > 1 and groups_ > 1. - CHECK_EQ(groups_, static_cast(1)); - nnp_status status = nnp_convolution_output(algorithm_, - batchSize, - inputChannels, - outputChannels, - inputSize, - padding, - kernelSize, - inputData, - filterData, - nullptr, /* bias */ - outputData, - bufferPtr, - sizePtr, - nnp_activation_identity, - nullptr, - threadpool_, /* threadpool */ - nullptr); - CHECK_EQ(status, nnp_status_success); - } - } - - static void create_nnpack_threadpool() { - if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) { - threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); - VLOG(3) << "Number of threads " - << pthreadpool_get_threads_count(threadpool_); - } - } - - private: - nnp_convolution_algorithm algorithm_; - nnp_convolution_transform_strategy transform_strategy_; - void* workspaceBuffer_; - size_t workspaceSize_; - static pthreadpool_t threadpool_; -}; - -template -pthreadpool_t NNPACKConvFunction::threadpool_ = nullptr; - -REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction); - -} // namespace paddle diff --git a/paddle/function/nnpack/NNPACKConvOpTest.cpp b/paddle/function/nnpack/NNPACKConvOpTest.cpp deleted file mode 100644 index c80ffb5d5d255465e9a2fa251fb9a6c61f96e7ec..0000000000000000000000000000000000000000 --- a/paddle/function/nnpack/NNPACKConvOpTest.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
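The deleted NNPACK wrapper queries the required workspace size by calling the convolution entry points with null buffers, then keeps a cached 64-byte-aligned buffer that grows on demand. As transcribed above, the growth path frees the old buffer but calls posix_memalign only when the pointer was still null, so a second growth would leave the pointer dangling; a corrected grow-only cache looks like this (a sketch, not Paddle's code):

```cpp
#include <cstdlib>

// Grow-only aligned workspace cache. Unlike the transcription above,
// growth always frees the old buffer and then re-allocates.
struct Workspace {
  void* buf = nullptr;
  size_t size = 0;

  void* require(size_t needSize) {
    if (needSize > size) {
      if (buf) free(buf);
      buf = nullptr;
      // 64-byte alignment, matching the original posix_memalign call.
      if (posix_memalign(&buf, 64, needSize) != 0) return nullptr;
      size = needSize;
    }
    return buf;
  }

  ~Workspace() { free(buf); }
};

int main() {
  Workspace ws;
  void* a = ws.require(1 << 10);
  void* b = ws.require(1 << 20);  // grows: old buffer freed, new one made
  return (a && b) ? 0 : 1;
}
```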
*/ - -#include -#include "paddle/function/ConvOpTest.h" - -namespace paddle { - -TEST(NNPACK, Forward) { - Convolution( - "GemmConv-CPU", "NNPACKConv-CPU", forward); -} - -TEST(NNPACK, Depthwise) { - DepthwiseConvolution( - "GemmConv-CPU", "NNPACKConv-CPU", forward); -} - -} // namespace paddle diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp deleted file mode 100644 index 71c238fbfe9f32f3764601ebb441336931f8ef5f..0000000000000000000000000000000000000000 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ /dev/null @@ -1,509 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ActivationFunction.h" - -#include -#include -#include -#include -#include -#include -#include "paddle/parameter/Argument.h" -#include "paddle/utils/ClassRegistrar.h" -#include "paddle/utils/Logging.h" - -#ifdef PADDLE_WITH_MKLDNN -#include "MKLDNNActivation.h" -#endif - -namespace paddle { - -static ClassRegistrar gActivationRegistrar; -/** - * @def ACTIVATION_CLASS_NAME - * @brief Macro for getting derived activation class name - * @note ACTIVATION_CLASS_NAME(softmax) softmax_; - * means softmaxActivation softmax_; - */ -#define ACTIVATION_CLASS_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Activation -/** - * @def BEGIN_DEFINE_ACTIVATION - * @brief Macro for defining a devried activation class - */ -#define BEGIN_DEFINE_ACTIVATION(ACTIVATION_NAME) \ - class ACTIVATION_CLASS_NAME(ACTIVATION_NAME) : public ActivationFunction { \ - private: \ - static const std::string name; \ - \ - public: \ - const std::string& getName() const { return name; } -/** - * @def END_DEFINE_ACTIVATION - * @brief Macro for registering a derived activation class - */ -#define END_DEFINE_ACTIVATION(ACTIVATION_NAME) \ - } \ - ; \ - const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \ - #ACTIVATION_NAME; \ - static InitFunction __reg_activation__##ACTIVATION_NAME([] { \ - gActivationRegistrar \ - .registerClass( \ - #ACTIVATION_NAME); \ - }); - -/** - * @brief The IdentityActivation class - * - * Do nothing when forward/backward. 
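ActivationFunction.cpp builds every activation through BEGIN_DEFINE_ACTIVATION/END_DEFINE_ACTIVATION, pairing a ClassRegistrar with a static InitFunction so each class registers itself at program start and nothing outside the file has to enumerate the types. A minimal version of that self-registration idiom (all names here are illustrative stand-ins, not Paddle's):

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Activation {
  virtual ~Activation() = default;
  virtual const char* name() const = 0;
};

// Minimal stand-in for paddle's ClassRegistrar<ActivationFunction>.
std::map<std::string, std::function<std::unique_ptr<Activation>()>>&
registry() {
  static std::map<std::string, std::function<std::unique_ptr<Activation>()>> r;
  return r;
}

// Stand-in for InitFunction: runs the registration at static-init time.
struct Register {
  Register(const std::string& type,
           std::function<std::unique_ptr<Activation>()> f) {
    registry()[type] = std::move(f);
  }
};

#define DEFINE_ACTIVATION(NAME)                                  \
  struct NAME##Activation : Activation {                         \
    const char* name() const override { return #NAME; }          \
  };                                                             \
  static Register reg_##NAME(#NAME, [] {                         \
    return std::unique_ptr<Activation>(new NAME##Activation());  \
  });

DEFINE_ACTIVATION(sigmoid)
DEFINE_ACTIVATION(relu)

int main() {
  std::cout << registry().at("relu")()->name() << "\n";  // prints "relu"
  return 0;
}
```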
- */ -class IdentityActivation : public ActivationFunction { - public: - static const std::string name; - Error __must_check forward(Argument& act) { - (void)act; - return Error(); - } - Error __must_check backward(Argument& act) { - (void)act; - return Error(); - } - const std::string& getName() const { return name; } -}; -const std::string IdentityActivation::name = ""; -static InitFunction __reg_activation__identity([] { - gActivationRegistrar.registerClass(""); - gActivationRegistrar.registerClass("linear"); -}); - -/** - * @brief Sigmoid Activation - * \f[ - * f(z) = \frac{1}{1+exp(-z)} - * \f] - */ -BEGIN_DEFINE_ACTIVATION(sigmoid) -Error __must_check forward(Argument& act) { - act.value->sigmoid(*act.value); - return Error(); -} -Error __must_check backward(Argument& act) { - act.grad->sigmoidDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(sigmoid) - -/** - * @brief Softmax Activation - * \f[ - * P(y=j|x) = \frac{e^{x^Tw_j}}{\sum^K_{k=1}e^{x^Tw_k}} - * \f] - */ -BEGIN_DEFINE_ACTIVATION(softmax) -private: -MatrixPtr sftMaxSum_; -MatrixPtr sftMaxDot_; - -public: -Error __must_check forward(Argument& act) { - act.value->softmax(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - MatrixPtr outputV = act.value; - MatrixPtr outputG = act.grad; - - if (outputG->useGpu()) { - outputG->softmaxBackward(*outputV); - } else { - SetDevice device(act.deviceId); - Matrix::resizeOrCreate(sftMaxDot_, - outputG->getHeight(), - outputG->getWidth(), - /* trans */ false, - useGpu(act.deviceId)); - Matrix::resizeOrCreate(sftMaxSum_, - outputG->getHeight(), - 1, - /* trans */ false, - useGpu(act.deviceId)); - - sftMaxDot_->dotMul(*outputG, *outputV); - sftMaxSum_->colMerge(*sftMaxDot_); - - act.grad->softmaxDerivative(*act.value, *sftMaxSum_); - } - return Error(); -} -END_DEFINE_ACTIVATION(softmax) - -/** - * @brief Sequence_softmax Activation - * @note Softmax on all frames of one sequence. - * Width of frame must be one. - */ -BEGIN_DEFINE_ACTIVATION(sequence_softmax) -private: -ACTIVATION_CLASS_NAME(softmax) softmax_; -Argument argument_; - -public: -Error __must_check forward(Argument& act) { - if (act.value->getWidth() != 1UL) { - return Error( - "Input width for each timestep of sequence softmax should be 1"); - } - - if (!argument_.value) { - argument_.value = Matrix::create(nullptr, - /* height= */ 1, - 1, - /* trans= */ false, - useGpu(act.deviceId)); - argument_.grad = Matrix::create(nullptr, - /* height= */ 1, - 1, - /* trans= */ false, - useGpu(act.deviceId)); - } - - auto starts = - act.hasSubseq() - ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId)) - : act.sequenceStartPositions->getVector(useGpu(act.deviceId)); - act.value->sequenceSoftmax(*act.value, *starts); - return Error(); -} - -Error __must_check backward(Argument& act) { - if (act.value->getWidth() != 1UL) { - return Error( - "Input width for each timestep of sequence softmax should be 1"); - } - - size_t numSequences = - act.hasSubseq() ? 
act.getNumSubSequences() : act.getNumSequences(); - const int* starts = act.getCpuStartPositions(); - - for (size_t i = 0; i < numSequences; ++i) { - // TODO(Dangqingqing) optimization for GPU - size_t offset = starts[i]; - size_t size = starts[i + 1] - starts[i]; - argument_.value->setData(act.value->getData() + offset, 1UL, size); - argument_.grad->setData(act.grad->getData() + offset, 1UL, size); - - Error err = softmax_.backward(argument_); - if (!err.isOK()) return err; - } - return Error(); -} -END_DEFINE_ACTIVATION(sequence_softmax) - -/* - * @brief SoftSign Activation. - * \f[ - * f(z) = \frac{z}{1 + |z|} - * \f] - */ -BEGIN_DEFINE_ACTIVATION(softsign) -private: -MatrixPtr denominator_; - -Error __must_check forward(Argument& act) { - size_t height = act.value->getHeight(); - size_t width = act.value->getWidth(); - Matrix::resizeOrCreate( - denominator_, height, width, false, useGpu(act.deviceId)); - denominator_->assign(*act.value); - denominator_->abs2(); - denominator_->add(1.); - - act.value->dotDiv(*act.value, *denominator_); - return Error(); -} - -Error __must_check backward(Argument& act) { - denominator_->square2(); - denominator_->scalarDiv(*denominator_, 1.); - act.grad->dotMul(*act.grad, *denominator_); - return Error(); -} -END_DEFINE_ACTIVATION(softsign) - -/** - * @brief Relu Activation. - * forward. y = max(0, z) - * - * derivative of relu is: - * - * 1 if z > 0 - * - * 0 otherwise. - */ -BEGIN_DEFINE_ACTIVATION(relu) -Error __must_check forward(Argument& act) { - act.value->relu(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->reluDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(relu) - -/** - * @brief BRelu Activation. - * - * forward. y = min(24, max(0, z)) - * - * derivative of brelu is: - * - * 1 if 0 < z < 24 - * - * 0 otherwise. - * - * TODO(yuyang18): Remove magic number 24 or make it configuable. - */ -BEGIN_DEFINE_ACTIVATION(brelu) -Error __must_check forward(Argument& act) { - act.value->brelu(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->breluDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(brelu) - -/** - * @brief Tanh Activation. - * \f[ - * f(z) = tanh(z)=\frac{e^z-e^{-z}}{e^z+e^{-z}} - * \f] - */ -BEGIN_DEFINE_ACTIVATION(tanh) -Error __must_check forward(Argument& act) { - act.value->tanh(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->tanhDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(tanh) - -/** - * @brief Scaled Tanh Activation - * \f[ - * f(z) = 1.7159 * tanh(2/3*z) - * \f] - */ -BEGIN_DEFINE_ACTIVATION(stanh) -private: -real a, b; - -public: -ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {} -Error __must_check forward(Argument& act) { - act.value->scaledTanh(*act.value, a, b); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->scaledTanhDerivative(*act.value, a, b); - return Error(); -} -END_DEFINE_ACTIVATION(stanh) - -/** - * @brief Soft Relu Activation. - * \f[ - * f(z) = ln(1+e^z) - * \f] - */ -BEGIN_DEFINE_ACTIVATION(softrelu) -Error __must_check forward(Argument& act) { - act.value->softrelu(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->softreluDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(softrelu) - -/** - * @brief Abs Activation. 
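The CPU softmax backward above is a Jacobian-vector product in disguise: dotMul forms g ∘ y, colMerge row-sums it to <g, y>, and softmaxDerivative applies dz = y ∘ (g - <g, y>). A numeric check of that identity against central differences, assuming a scalar loss L = <g, softmax(z)>:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<double> softmax(const std::vector<double>& z) {
  double m = *std::max_element(z.begin(), z.end());
  std::vector<double> y(z.size());
  double s = 0;
  for (size_t i = 0; i < z.size(); ++i) s += (y[i] = std::exp(z[i] - m));
  for (double& v : y) v /= s;
  return y;
}

int main() {
  const std::vector<double> z = {0.3, -1.2, 2.0};  // pre-activations
  const std::vector<double> g = {0.5, -0.7, 0.1};  // dL/dy
  std::vector<double> y = softmax(z);

  // <g, y>, the row sum formed by dotMul + colMerge above.
  double dot = 0;
  for (size_t i = 0; i < y.size(); ++i) dot += g[i] * y[i];

  const double eps = 1e-6;
  for (size_t i = 0; i < z.size(); ++i) {
    double analytic = y[i] * (g[i] - dot);  // dz_i = y_i * (g_i - <g, y>)

    // Central-difference estimate of dL/dz_i for L = <g, softmax(z)>.
    std::vector<double> zp = z, zm = z;
    zp[i] += eps;
    zm[i] -= eps;
    std::vector<double> yp = softmax(zp), ym = softmax(zm);
    double lp = 0, lm = 0;
    for (size_t j = 0; j < y.size(); ++j) {
      lp += g[j] * yp[j];
      lm += g[j] * ym[j];
    }
    double numeric = (lp - lm) / (2 * eps);
    assert(std::fabs(analytic - numeric) < 1e-6);
  }
  std::puts("softmax backward identity verified");
  return 0;
}
```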
- * Forward: f(z) = abs(z) - * - * Derivative: - * - * 1 if z>0 - * - * -1 if z<0 - * - * 0 if z=0 - */ -BEGIN_DEFINE_ACTIVATION(abs) -Error __must_check forward(Argument& act) { - SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, - act.value->getHeight(), - act.value->getWidth(), - /* trans */ false, - useGpu(act.deviceId)); - - act.in->copyFrom(*act.value); - act.value->abs2(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->absDerivative(*act.in); - return Error(); -} -END_DEFINE_ACTIVATION(abs) - -/** - * @brief Square Activation. - * \f[ - * f(z) = z^2. - * \f] - */ -BEGIN_DEFINE_ACTIVATION(square) -Error __must_check forward(Argument& act) { - SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, - act.value->getHeight(), - act.value->getWidth(), - /* trans */ false, - useGpu(act.deviceId)); - - act.in->copyFrom(*act.value); - act.value->square2(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->squareDerivative(*act.in); - return Error(); -} -END_DEFINE_ACTIVATION(square) - -/** - * @brief Exponential Activation. - * \f[ - * f(z) = e^z - * \f] - */ -BEGIN_DEFINE_ACTIVATION(exponential) -Error __must_check forward(Argument& act) { - act.value->exp2(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->expDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(exponential) - -/** - * @brief Reciprocal Activation. - * \f[ - * f(z) = 1/z - * \f] - */ -BEGIN_DEFINE_ACTIVATION(reciprocal) -Error __must_check forward(Argument& act) { - act.value->reciprocal2(); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->dotMulSquare(*act.value); - act.grad->neg(); - return Error(); -} -END_DEFINE_ACTIVATION(reciprocal) - -/** - * @brief Square Root Activation. - * \f[ - * f(z) = sqrt(z) - * \f] - */ -BEGIN_DEFINE_ACTIVATION(sqrt) -Error __must_check forward(Argument& act) { - act.value->sqrt2(); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->dotDiv(*act.grad, *act.value); - act.grad->mulScalar(0.5); - return Error(); -} -END_DEFINE_ACTIVATION(sqrt) - -/** - * @brief Logarithm Activation. 
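A pattern worth noting across these activations: when the derivative is expressible in the output alone, backward needs no saved input (sqrt: dy/dz = 1/(2y); reciprocal: dy/dz = -y^2), while abs, square, and log must copy the input into act.in during forward. A small finite-difference check of the two output-only rules used above:

```cpp
#include <cassert>
#include <cmath>

// Central-difference derivative of f at z.
template <typename F>
double numDiff(F f, double z, double eps = 1e-6) {
  return (f(z + eps) - f(z - eps)) / (2 * eps);
}

int main() {
  double z = 1.7;

  // sqrt: y = sqrt(z), dy/dz = 0.5 / y; backward divides the gradient
  // by the output and scales by 0.5, no stored input required.
  double y = std::sqrt(z);
  assert(std::fabs(numDiff([](double v) { return std::sqrt(v); }, z) -
                   0.5 / y) < 1e-8);

  // reciprocal: y = 1/z, dy/dz = -1/z^2 = -y^2; backward multiplies the
  // gradient by the squared output and negates it.
  y = 1.0 / z;
  assert(std::fabs(numDiff([](double v) { return 1.0 / v; }, z) -
                   (-y * y)) < 1e-8);

  return 0;
}
```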
- * \f[ - * f(z) = log(z) - * \f] - */ -BEGIN_DEFINE_ACTIVATION(log) -Error __must_check forward(Argument& act) { - SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, - act.value->getHeight(), - act.value->getWidth(), - /* trans */ false, - useGpu(act.deviceId)); - - act.in->copyFrom(*act.value); - act.value->log2(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->dotDiv(*act.grad, *act.in); - return Error(); -} -END_DEFINE_ACTIVATION(log) - -ActivationFunction* ActivationFunction::create(const std::string& type) { -#ifdef PADDLE_WITH_MKLDNN - if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) { - return MKLDNNActivation::create(type); - } -#endif - - return gActivationRegistrar.createByType(type); -} - -std::vector ActivationFunction::getAllRegisteredTypes() { - std::vector types; - gActivationRegistrar.forEachType( - [&](const std::string& type) { types.push_back(type); }); - return types; -} - -} // namespace paddle diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h deleted file mode 100644 index eece1b9c37e72624dffd119804c65f7bd36e20fb..0000000000000000000000000000000000000000 --- a/paddle/gserver/activations/MKLDNNActivation.h +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "ActivationFunction.h" -#include "mkldnn.hpp" -#include "paddle/gserver/layers/MKLDNNBase.h" -#include "paddle/math/MKLDNNMatrix.h" -#include "paddle/parameter/Argument.h" - -namespace paddle { - -/** - * @brief Base class of MKLDNN Activation. - * Common activation function are provieded, - * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax - */ -class MKLDNNActivation : public ActivationFunction { - protected: - // input value element count - size_t cnt_; - // should not merge the resetBwd into resetFwd, - // because the grad data would be changing before backward. - bool needResetBwd_; - // mkldnn matrix, primitive, stream and pipeline - MKLDNNMatrixPtr val_; - MKLDNNMatrixPtr grad_; - std::shared_ptr engine_; - std::shared_ptr stream_; - std::shared_ptr fwd_; - std::shared_ptr bwd_; - std::vector pipelineFwd_; - std::vector pipelineBwd_; - - public: - MKLDNNActivation() : cnt_(0), needResetBwd_(true) {} - ~MKLDNNActivation() {} - static ActivationFunction* create(const std::string& type); - static std::vector getAllRegisteredTypes(); - virtual const std::string& getName() const = 0; - /** - * reset the forward primitives - */ - virtual void resetFwd(Argument& act); - /** - * reset the backward primitives, - * can not merge this functions into resetFwd as the grad data - * would be changing before backward. - */ - virtual void resetBwd(Argument& act) {} - virtual Error __must_check forward(Argument& act); - virtual Error __must_check backward(Argument& act); -}; - -/** - * @brief Base class of MKLDNN Eltwise Activation, - * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh. 
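ActivationFunction::create() at the end of the deleted file routes any type whose name starts with "mkldnn_" to the MKLDNN factory and everything else to the plain registrar. The `compare(0, 7, ...)` prefix test in isolation, since it is easy to misread:

```cpp
#include <cassert>
#include <string>

// True when `type` begins with the 7-character prefix "mkldnn_",
// the test used by ActivationFunction::create() above.
bool isMkldnnType(const std::string& type) {
  return !type.empty() && type.compare(0, 7, "mkldnn_") == 0;
}

int main() {
  assert(isMkldnnType("mkldnn_relu"));
  assert(!isMkldnnType("relu"));
  assert(!isMkldnnType(""));        // the empty name maps to identity
  assert(!isMkldnnType("mkldnn"));  // the prefix must include the underscore
  return 0;
}
```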
- */ -class MKLDNNEltwiseActivation : public MKLDNNActivation { - typedef mkldnn::eltwise_forward eltwise_fwd; - typedef mkldnn::eltwise_backward eltwise_bwd; - typedef mkldnn::algorithm algorithm; - - protected: - // save the forward primitive desc, which can be used backward - std::shared_ptr fwdPD_; - // eltwise_bwd need src input value - MKLDNNMatrixPtr inVal_; - // use for copy data - std::shared_ptr copyInVal_; - - public: - MKLDNNEltwiseActivation() {} - ~MKLDNNEltwiseActivation() {} - virtual const std::string& getName() const = 0; - - // in common, the alpha of forward and backward should be equal. - // but for relu, to avoid negative value, they should be opposite - virtual float getAlpha() const = 0; - virtual float getBwdAlpha() const = 0; - virtual float getBeta() const { return 0.f; } - virtual algorithm getAlgo(std::string type) const; - void resetFwd(Argument& act) override; - void resetBwd(Argument& act) override; -}; - -/** - * @brief Base class of MKLDNN softmax Activation, - * only have mkldnn forward, use cpu implement for backward. - */ -class MKLDNNSoftmaxActivation : public MKLDNNActivation { - typedef mkldnn::softmax_forward softmax_fwd; - - private: - // for backward - MatrixPtr sftMaxSum_; - MatrixPtr sftMaxDot_; - - public: - MKLDNNSoftmaxActivation() {} - ~MKLDNNSoftmaxActivation() {} - virtual const std::string& getName() const = 0; - void resetFwd(Argument& act) override; - Error __must_check forward(Argument& act) override; - Error __must_check backward(Argument& act) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h deleted file mode 100644 index 21822b10c2ebf1d353195794cf8f49e02b64c177..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/DataProvider.h +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "DataConfig.pb.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/math/Vector.h" -#include "paddle/parameter/Argument.h" -#include "paddle/utils/ClassRegistrar.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Queue.h" -#include "paddle/utils/ThreadLocal.h" -#include "paddle/utils/Util.h" - -namespace paddle { -/** - * @def REGISTER_DATA_PROVIDER - * @brief Macro for registering a data provider. The class type should contain - * a consturctor with parameter (DataConfig, bool). 
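MKLDNNActivation builds its forward pipeline lazily but, per the needResetBwd_ comment above, deliberately defers building the backward primitives until backward() runs, because the gradient buffers may still be re-bound after forward. That control flow reduced to a sketch, with the real mkldnn primitive construction replaced by placeholders:

```cpp
#include <cstdio>

// Sketch of MKLDNNActivation's reset discipline; resetFwd()/resetBwd()
// stand in for constructing the real mkldnn primitives.
class LazyActivation {
  bool fwdReady_ = false;
  bool needResetBwd_ = true;

  void resetFwd() { std::puts("build forward primitives"); }
  void resetBwd() { std::puts("build backward primitives"); }

 public:
  void forward() {
    if (!fwdReady_) {
      resetFwd();
      fwdReady_ = true;
      // Do NOT build backward here: the gradient buffers it would
      // capture can still be re-bound before backward() runs.
      needResetBwd_ = true;
    }
    // ... submit forward pipeline ...
  }

  void backward() {
    if (needResetBwd_) {
      resetBwd();  // gradient buffers are final by now
      needResetBwd_ = false;
    }
    // ... submit backward pipeline ...
  }
};

int main() {
  LazyActivation act;
  act.forward();
  act.backward();
  return 0;
}
```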
- */ -#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - DataProvider::registrar_.registerClass( \ - #__type_name, \ - [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \ - DataProvider* dp = new __class_name(conf, useGpu); \ - return dp; \ - }); \ - }) - -/** - * @def REGISTER_DATA_PROVIDER_EX - * @brief Macro for registering a data provider, which contains a constructor - * with parameter (DataConfig, ModelConfig, bool). - */ -#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([] { \ - DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ - }) - -class DataBatch; -class BufferBatch; -typedef std::shared_ptr DataBatchPtr; -typedef std::shared_ptr BufferBatchPtr; -/** - * @brief Data for batch training a neural network - */ -class DataBatch { - public: - DataBatch() : size_(0) { data_.clear(); } - /** - * @brief Get batch size - * @return batch size - */ - int64_t getSize() const { return size_; } - /** - * @brief Get num of sequences of sequence data - * @return num of sequences - */ - int64_t getNumSequences() const { - if (data_.empty()) return size_; - return data_[0].sequenceStartPositions - ? data_[0].sequenceStartPositions->getSize() - 1 - : size_; - } - /** - * @brief Set batch size - * @param[in] size size - */ - void setSize(int64_t size) { size_ = size; } - /** - * @brief Get size of argument vector - * @return size of argument vector - * @note For usual supervised learning, input data and label is needed, - * then there will be two argument. - */ - int64_t getNumStreams() const { return data_.size(); } - - /** - * @brief Get a argument with index i - * @param[in] i index in argument vector - * @return a argument with index i - */ - const Argument& getStream(int i) const { return data_[i]; } - /** - * @brief Get all argument - * @return an argument vector - */ - std::vector& getStreams() { return data_; } - /** - * @brief Get all argument const - * @return an argument vector - */ - std::vector getStreams() const { return data_; } - /** - * @brief Clear DataBatch - */ - void clear() { - data_.clear(); - size_ = 0; - } - - /** - * @brief Append data to DataBatch - * @param[in] data matrix data - * @note The order in which each data stream is appended must match the order - * specified in stream_names of DataConfig. The stream_names can be obtained - * using DataProvider::getStreamNames(). - */ - void appendData(MatrixPtr data) { - Argument argu; - argu.value = data; - data_.push_back(argu); - } - - /** - * @brief Append sequence data to DataBatch - * @param[in] data matrix data - * @param[in] sequenceStartPositions sequence data - * @note The order in which each data stream is appended must match the order - * specified in stream_names of DataConfig. The stream_names can be obtained - * using DataProvider::getStreamNames(). 
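DataBatch::getNumSequences() relies on the convention used throughout these providers and evaluators: sequence boundaries are stored as a positions array of length numSequences + 1, and sequence i occupies the half-open range [starts[i], starts[i+1]). For example:

```cpp
#include <cassert>
#include <vector>

int main() {
  // Three sequences packed into one batch of 9 samples:
  // lengths 4, 2 and 3.
  std::vector<int> starts = {0, 4, 6, 9};

  size_t numSequences = starts.size() - 1;  // as in getNumSequences()
  assert(numSequences == 3);

  for (size_t i = 0; i < numSequences; ++i) {
    int offset = starts[i];                  // first sample of sequence i
    int length = starts[i + 1] - starts[i];  // its length
    assert(offset >= 0 && length > 0);
  }
  return 0;
}
```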
- */ - void appendData(const MatrixPtr& data, - const ICpuGpuVectorPtr& sequenceStartPositions) { - Argument argu; - argu.value = data; - argu.sequenceStartPositions = sequenceStartPositions; - data_.push_back(argu); - } - /** - * @brief Append label data - * @param[in] label label data - * @param[in] value matrix data, default null - */ - void appendLabel(IVectorPtr label, MatrixPtr value = nullptr) { - Argument argu; - argu.ids = label; - argu.value = value; - data_.push_back(argu); - } - - /* - * @brief Append argument - * @param[in] argus DataBatch.getStreams() - * @param[in] size DataBatch.getSize() - * @param[in] dataId sub dataprovider id (in MultiDataProvider) - */ - void appendArguments(const std::vector& argus, - int size, - int dataId) { - size_ += size; - for (const auto& argu : argus) { - data_.push_back(argu); - data_.back().dataId = dataId; - } - } - - protected: - /** - * @brief batch size - */ - int64_t size_; - /** - * @brief A batch data consist of a Argument vector, - * An argument corresponds to a type of input data. - */ - std::vector data_; -}; - -class BufferBatch { - public: - BufferBatch() { - hlStream_ = HPPL_STREAM_DEFAULT; - hlEvent_ = NULL; - batchData_ = NULL; - } - ~BufferBatch() { - if (hlEvent_) { - hl_destroy_event(hlEvent_); - hlEvent_ = NULL; - } - delete batchData_; - batchData_ = NULL; - } - - void setDataBatch(DataBatch* batchData) { batchData_ = batchData; } - DataBatch* getDataBatch() { return batchData_; } - - void setCuStream(hl_stream_t stream) { hlStream_ = stream; } - hl_stream_t getCuStream() const { return hlStream_; } - - void setCuEvent(hl_event_t event) { hlEvent_ = event; } - - hl_event_t getCuEvent() const { return hlEvent_; } - - void createCuEvent() { - if (!hlEvent_) { - hlStream_ = HPPL_STREAM_1; - hl_create_event(&hlEvent_); - } - } - - void syncEvent() { - if (hlEvent_) { - hl_stream_wait_event(hlStream_, hlEvent_); - } - } - - void swap(BufferBatch* bufBatch); - void clone(DataBatch* srcBatch, bool useGpu); - - protected: - DataBatch* batchData_; - hl_stream_t hlStream_; - hl_event_t hlEvent_; -}; - -class DataProvider; -typedef std::shared_ptr DataProviderPtr; - -typedef Queue BufferBatchQueue; - -class DoubleBuffer { - public: - DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0); - virtual ~DoubleBuffer(); - void removeOneBatch(DataBatch* dataBatch); - - void setBatchSize(int64_t newBatchSize) { batchSize_ = newBatchSize; } - - int64_t getBatchSize() { return batchSize_; } - - void startAsyncLoad(); - void finishAsyncLoad() { - stopping_ = true; - taskReadySem_.post(); - if (asyncLoader_) { - asyncLoader_->join(); - } - } - - void setPending(bool pending) { pending_ = pending; } - - protected: - virtual void asyncLoadBatch(); - void insertOneBatch(DataBatch* batch); - - DataProvider* dataPool_; - bool useGpu_; - int32_t batchSize_; - ThreadLocal usingBatch_; - BufferBatchQueue* dataQueue_; - BufferBatchQueue* bufferQueue_; - std::unique_ptr asyncLoader_; - Semaphore taskReadySem_; - bool stopping_; - bool pending_; -}; - -/** - * @brief Base class for DataProvider, which supplies data for training - * @note It can supplies multiple streams of data. - * For typical supervised training, there are two streams: - * one is for input, one is for label. - */ -class DataProvider { - public: - static ClassRegistrar registrar_; - static DataProvider* create(const DataConfig& config, - const ModelConfig& modelConfig, - bool useGpu = FLAGS_use_gpu); - - /** - * @brief create only used for unittest. 
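DoubleBuffer above overlaps data loading with training: an asyncLoader_ thread fills BufferBatch objects into one queue while the trainer consumes them and recycles empties through a second queue. The producer/consumer skeleton, with a plain mutex/condition-variable queue standing in for paddle's Queue:

```cpp
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>

// Minimal blocking queue standing in for paddle::Queue<BufferBatchPtr>.
template <typename T>
class BlockingQueue {
  std::queue<T> q_;
  std::mutex m_;
  std::condition_variable cv_;

 public:
  void push(T v) {
    { std::lock_guard<std::mutex> g(m_); q_.push(std::move(v)); }
    cv_.notify_one();
  }
  T pop() {
    std::unique_lock<std::mutex> l(m_);
    cv_.wait(l, [&] { return !q_.empty(); });
    T v = std::move(q_.front());
    q_.pop();
    return v;
  }
};

int main() {
  BlockingQueue<int> ready;  // loaded batches (dataQueue_)
  BlockingQueue<int> empty;  // recycled buffers (bufferQueue_)
  for (int i = 0; i < 2; ++i) empty.push(0);  // two buffers in flight

  std::thread loader([&] {  // asyncLoadBatch(): fill while training runs
    for (int batch = 1; batch <= 4; ++batch) {
      empty.pop();       // wait for a free buffer
      ready.push(batch); // hand a loaded batch to the trainer
    }
  });

  for (int step = 0; step < 4; ++step) {
    int batch = ready.pop();  // getNextBatchFromBuffer()
    std::printf("train on batch %d\n", batch);
    empty.push(0);            // removeOneBatch() recycles the buffer
  }
  loader.join();
  return 0;
}
```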
- */ - inline static DataProvider* create(const DataConfig& config, - bool useGpu = FLAGS_use_gpu) { - return create(config, ModelConfig(), useGpu); - } - - DataProvider(const DataConfig& config, bool useGpu) - : config_(config), - skipShuffle_(false), - usageRatio_(config.usage_ratio()), - useGpu_(useGpu) { - if (config_.async_load_data()) { - initAsyncLoader(); - } - } - virtual ~DataProvider() {} - - const DataConfig& getConfig() const { return config_; } - - void setSkipShuffle() { skipShuffle_ = true; } - - /** - * @brief Get next batch of training samples - * @param[in] size size of training samples to get - * @param[out] batch a batch of training samples - * @return actual size of obtained training samples - */ - int64_t getNextBatch(int64_t size, DataBatch* batch); - - /** - * @brief Shuffle the data set - */ - virtual void shuffle() = 0; - - /** - * @brief reset all the value of index - * @note reset() must be called before any calls to getNextBatch() - * IMPORTANT: subclass reset() should always call the base class reset() - * at the end of the function - */ - virtual void reset() { - if (doubleBuffer_ != nullptr) { - doubleBuffer_->startAsyncLoad(); - } - } - - /** - * @brief Get the size of training samples - * @return the number of training samples in the data set. - * @note return -1 to indicate unlimited number of samples. - */ - virtual int64_t getSize() = 0; - - /** - * @brief Get next batch training samples internally - * @param[in] size size of training samples to get - * @param[out] batch a batch of training samples - * @return actual size of obtained training samples - */ - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0; - - protected: - DataConfig config_; - bool skipShuffle_; - float usageRatio_; - bool useGpu_; - std::unique_ptr doubleBuffer_; - ThreadLocal> constantSlots_; - /** - * @@brief Get next batch training samples from buffer - * @param[in] size size of training samples to get - * @param[out] batch a batch of training samples - * @return actual size of obtained training samples - */ - int64_t getNextBatchFromBuffer(int64_t size, DataBatch* batch); - - void initAsyncLoader(); -}; - -/** - * A data provider which does nothing. It only serves as providing - * necessary configurations such as stream_names - */ -class DummyDataProvider : public DataProvider { - public: - DummyDataProvider(const DataConfig& config, bool useGpu) - : DataProvider(config, useGpu) {} - virtual void shuffle() {} - virtual void reset() { DataProvider::reset(); } - virtual int64_t getSize() { return 0; } - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) { - (void)size; - (void)batch; - return 0; - } -}; - -/** - * Data provider for one input and one integer label. 
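DummyDataProvider shows the minimum a concrete provider must implement: shuffle(), getSize() (returning -1 for an unlimited stream), and getNextBatchInternal(); an overridden reset() must chain back to the base so asynchronous loading restarts. A skeleton of that contract with simplified signatures (no Argument/DataBatch machinery):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Simplified stand-in for the DataProvider contract above.
class Provider {
 public:
  virtual ~Provider() = default;
  virtual void shuffle() = 0;
  virtual long getSize() = 0;  // -1 means "unlimited samples"
  virtual long getNextBatchInternal(long size, std::vector<float>* batch) = 0;
  virtual void reset() { /* base restarts async loading here */ }
};

class CountingProvider : public Provider {
  long next_ = 0;

 public:
  void shuffle() override {}
  long getSize() override { return 10; }
  long getNextBatchInternal(long size, std::vector<float>* batch) override {
    batch->clear();
    long n = std::min(size, getSize() - next_);  // actual size may be smaller
    for (long i = 0; i < n; ++i) batch->push_back(float(next_++));
    return n;
  }
  void reset() override {
    next_ = 0;
    Provider::reset();  // chain to the base, as the comment above requires
  }
};

int main() {
  CountingProvider p;
  std::vector<float> batch;
  p.reset();
  long got = p.getNextBatchInternal(4, &batch);
  std::printf("got %ld samples\n", got);
  return 0;
}
```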
- */ -class SimpleDataProviderBase : public DataProvider { - protected: - /// sample feature dimension - int64_t sampleDim_; - /// the number of samples - int64_t bufferCapacity_; - int64_t sampleNumInBuf_; - /// next item to read in buffer - int64_t nextItemIndex_; - /// some user defined info for validation - bool withInfo_; - - /// data buffer: bufferCapacity_ * nDataDim_ - CpuMatrixPtr hInputDataBuf_; - - /// label buffer:bufferCapacity_ * 1 - CpuIVectorPtr hInputLabelBuf_; - - /// info buffer:bufferCapacity_ * 1 - CpuIVectorPtr hInputInfoBuf_; - - ThreadLocal dataBatch_; - ThreadLocal labelBatch_; - ThreadLocal infoBatch_; - - RWLock lock_; - - public: - SimpleDataProviderBase(const DataConfig& config, bool useGpu, bool withInfo); - ~SimpleDataProviderBase() {} - - void shuffle(); - - virtual void reset(); - - virtual int64_t getSize(); - - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); - - /// return the number of samples in the buffer - int64_t fillBuffer(); - - protected: - /** - * @brief Fill at most size samples into data and label. - * - * Each input is stored in contiguous memory locations in data. - * - * data[n * sampleDim_] .. data[n * sampleDim_ + sampleDim_ - 1] is for - * the input of the n-th sample. - * - * label[n] is the label for the n-th sample. - */ - virtual int64_t fillBufferImp(real* data, - int* label, - int* info, - int64_t size) = 0; -}; - -class SimpleDataProvider : public SimpleDataProviderBase { - public: - SimpleDataProvider(const DataConfig& config, bool useGpu); - ~SimpleDataProvider(); - virtual void reset(); - - protected: - void loadData(const std::string& fileName); - void loadDataFile(const std::string& fileName); - virtual int64_t fillBufferImp(real* data, - int* label, - int* info, - int64_t size); - - protected: - size_t currentSampleIndex_; - std::vector labels_; - std::vector data_; -}; - -} // namespace paddle diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp deleted file mode 100644 index c6cd41de9a1a22470d8659eb90d1ac2b075b2df9..0000000000000000000000000000000000000000 --- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp +++ /dev/null @@ -1,320 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
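SimpleDataProviderBase pins down its buffer layout exactly: sample n occupies data[n * sampleDim_] through data[n * sampleDim_ + sampleDim_ - 1], with label[n] alongside. A toy fillBufferImp honoring that contract:

```cpp
#include <cassert>
#include <vector>

// Toy fillBufferImp: writes `size` samples row-major into `data`,
// one label per sample, following SimpleDataProviderBase's layout.
long fillBufferImp(float* data, int* label, long sampleDim, long size) {
  for (long n = 0; n < size; ++n) {
    for (long d = 0; d < sampleDim; ++d) {
      data[n * sampleDim + d] = float(n);  // sample n's features
    }
    label[n] = int(n % 2);  // its integer label
  }
  return size;  // number of samples actually filled
}

int main() {
  const long sampleDim = 3, size = 4;
  std::vector<float> data(sampleDim * size);
  std::vector<int> label(size);
  long filled = fillBufferImp(data.data(), label.data(), sampleDim, size);
  assert(filled == size);
  assert(data[2 * sampleDim + 1] == 2.0f);  // feature 1 of sample 2
  return 0;
}
```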
*/ - -#include "Evaluator.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/utils/StringUtil.h" - -namespace paddle { - -/** - * calculate sequence-to-sequence edit distance - */ -class CTCErrorEvaluator : public Evaluator { - private: - MatrixPtr outActivations_; - int numTimes_, numClasses_, numSequences_, blank_; - real deletions_, insertions_, substitutions_; - int seqClassficationError_; - mutable std::unordered_map evalResults_; - - std::vector path2String(const std::vector& path) { - std::vector str; - str.clear(); - int prevLabel = -1; - for (std::vector::const_iterator label = path.begin(); - label != path.end(); - label++) { - if (*label != blank_ && - (str.empty() || *label != str.back() || prevLabel == blank_)) { - str.push_back(*label); - } - prevLabel = *label; - } - return str; - } - - std::vector bestLabelSeq() { - std::vector path; - path.clear(); - real* acts = outActivations_->getData(); - for (int i = 0; i < numTimes_; ++i) { - path.push_back(std::max_element(acts + i * numClasses_, - acts + (i + 1) * numClasses_) - - (acts + i * numClasses_)); - } - return path2String(path); - } - - /* "sp, dp, ip" is the weighting parameter of "substitution, deletion, - * insertion" - * in edit-distance error */ - real stringAlignment(std::vector& gtStr, - std::vector& recogStr, - bool backtrace = true, - real sp = 1.0, - real dp = 1.0, - real ip = 1.0) { - std::vector> matrix; - int substitutions, deletions, insertions; - real distance; - int n = gtStr.size(); - int m = recogStr.size(); - - if (n == 0) { - substitutions = 0; - deletions = 0; - insertions = m; - distance = m; - } else if (m == 0) { - substitutions = 0; - deletions = n; - insertions = 0; - distance = n; - } else { - substitutions = 0; - deletions = 0; - insertions = 0; - distance = 0; - // initialize the matrix - matrix.resize(n + 1); - for (int i = 0; i < n + 1; ++i) { - matrix[i].resize(m + 1); - for (int j = 0; j < m + 1; ++j) { - matrix[i][j] = 0; - } - } - for (int i = 0; i < n + 1; ++i) { - matrix[i][0] = i; - } - for (int j = 0; j < m + 1; ++j) { - matrix[0][j] = j; - } - - // calculate the insertions, substitutions and deletions - for (int i = 1; i < n + 1; ++i) { - int s_i = gtStr[i - 1]; - for (int j = 1; j < m + 1; ++j) { - int t_j = recogStr[j - 1]; - int cost = (s_i == t_j) ? 
0 : 1; - const int above = matrix[i - 1][j]; - const int left = matrix[i][j - 1]; - const int diag = matrix[i - 1][j - 1]; - const int cell = std::min(above + 1, std::min(left + 1, diag + cost)); - matrix[i][j] = cell; - } - } - - if (backtrace) { - size_t i = n; - size_t j = m; - substitutions = 0; - deletions = 0; - insertions = 0; - - while (i != 0 && j != 0) { - if (matrix[i][j] == matrix[i - 1][j - 1]) { - --i; - --j; - } else if (matrix[i][j] == matrix[i - 1][j - 1] + 1) { - ++substitutions; - --i; - --j; - } else if (matrix[i][j] == matrix[i - 1][j] + 1) { - ++deletions; - --i; - } else { - ++insertions; - --j; - } - } - while (i != 0) { - ++deletions; - --i; - } - while (j != 0) { - ++insertions; - --j; - } - int diff = substitutions + deletions + insertions; - if (diff != matrix[n][m]) { - LOG(ERROR) << "Found path with distance " << diff - << " but Levenshtein distance is " << matrix[n][m]; - } - - distance = (sp * substitutions) + (dp * deletions) + (ip * insertions); - } else { - distance = (real)matrix[n][m]; - } - } - real maxLen = std::max(m, n); - deletions_ += deletions / maxLen; - insertions_ += insertions / maxLen; - substitutions_ += substitutions / maxLen; - - if (distance != 0) { - seqClassficationError_ += 1; - } - - return distance / maxLen; - } - - real editDistance( - real* output, int numTimes, int numClasses, int* labels, int labelsLen) { - numTimes_ = numTimes; - numClasses_ = numClasses; - blank_ = numClasses_ - 1; - outActivations_ = Matrix::create(output, numTimes, numClasses); - std::vector recogStr, gtStr; - recogStr = bestLabelSeq(); - for (int i = 0; i < labelsLen; ++i) { - gtStr.push_back(labels[i]); - } - - return stringAlignment(gtStr, recogStr); - } - - void storeLocalValues() const { - evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0; - evalResults_["deletion_error"] = - numSequences_ ? deletions_ / numSequences_ : 0; - evalResults_["insertion_error"] = - numSequences_ ? insertions_ / numSequences_ : 0; - evalResults_["substitution_error"] = - numSequences_ ? 
substitutions_ / numSequences_ : 0; - evalResults_["sequence_error"] = - (real)seqClassficationError_ / numSequences_; - } - - public: - CTCErrorEvaluator() - : numTimes_(0), - numClasses_(0), - numSequences_(0), - blank_(0), - deletions_(0), - insertions_(0), - substitutions_(0), - seqClassficationError_(0) {} - - virtual real evalImp(std::vector& arguments) { - CHECK_EQ(arguments.size(), (size_t)2); - Argument output, label; - output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT); - label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - CHECK(label.sequenceStartPositions); - CHECK(label.ids); - size_t numSequences = label.sequenceStartPositions->getSize() - 1; - const int* labelStarts = label.sequenceStartPositions->getData(false); - const int* outputStarts = output.sequenceStartPositions->getData(false); - real totalErr = 0; - for (size_t i = 0; i < numSequences; ++i) { - real err = 0; - err = editDistance( - output.value->getData() + output.value->getWidth() * outputStarts[i], - outputStarts[i + 1] - outputStarts[i], - output.value->getWidth(), - label.ids->getData() + labelStarts[i], - labelStarts[i + 1] - labelStarts[i]); - - totalErr += err; - } - - return totalErr; - } - - virtual void eval(const NeuralNetwork& nn) { - Evaluator::eval(nn); - std::vector arguments; - arguments.reserve(config_.input_layers_size()); - for (const std::string& name : config_.input_layers()) { - arguments.push_back(nn.getLayer(name)->getOutput()); - } - } - - virtual void updateSamplesNum(const std::vector& arguments) { - numSequences_ += arguments[1].getNumSequences(); - } - - virtual void start() { - Evaluator::start(); - numSequences_ = 0; - blank_ = 0; - deletions_ = 0; - insertions_ = 0; - substitutions_ = 0; - seqClassficationError_ = 0; - } - - virtual void printStats(std::ostream& os) const { - storeLocalValues(); - os << config_.name() << " error = " << evalResults_["error"]; - os << " deletions error = " << evalResults_["deletion_error"]; - os << " insertions error = " << evalResults_["insertion_error"]; - os << " substitution error = " << evalResults_["substitution_error"]; - os << " sequence error = " << evalResults_["sequence_error"]; - } - - virtual void distributeEval(ParameterClient2* client) { - double buf[6] = {totalScore_, - (double)deletions_, - (double)insertions_, - (double)substitutions_, - (double)seqClassficationError_, - (double)numSequences_}; - client->reduce(buf, buf, 6, FLAGS_trainer_id, 0); - totalScore_ = buf[0]; - deletions_ = (real)buf[1]; - insertions_ = (real)buf[2]; - substitutions_ = (real)buf[3]; - seqClassficationError_ = (int)buf[4]; - numSequences_ = (int)buf[5]; - } - - void getNames(std::vector* names) { - storeLocalValues(); - names->reserve(names->size() + evalResults_.size()); - for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) { - names->push_back(config_.name() + "." 
+ it->first); - } - } - - real getValue(const std::string& name, Error* err) const { - storeLocalValues(); - - std::vector buffers; - paddle::str::split(name, '.', &buffers); - auto it = evalResults_.find(buffers[buffers.size() - 1]); - - if (it == evalResults_.end()) { - *err = Error("Evaluator does not have the key %s", name.c_str()); - return 0.0f; - } - - return it->second; - } - - std::string getType(const std::string& name, Error* err) const { - this->getValue(name, err); - if (!err->isOK()) { - return ""; - } - return "ctc_edit_distance"; - } -}; - -REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator); - -} // namespace paddle diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp deleted file mode 100644 index a2216293b1ab3a32e9cc903b805ca0aca10d58c1..0000000000000000000000000000000000000000 --- a/paddle/gserver/evaluators/ChunkEvaluator.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/math/Vector.h" -#include "paddle/utils/StringUtil.h" - -#include "Evaluator.h" - -namespace paddle { - -/** - * Chunk evaluator is used to evaluate segment labelling accuracy for a - * sequence. It calculates the chunk detection F1 score. - * - * A chunk is correctly detected if its beginning, end and type are correct. - * Other chunk type is ignored. - * For each label in the label sequence, we have - * - * @code - * tagType = label % numTagType - * chunkType = label / numTagType - * otherChunkType = numChunkTypes - * @endcode - * - * The total number of different labels is numTagType*numChunkTypes+1 - * We support 4 labelling scheme - * The tag type for each of the scheme is shown as follows: - * - * @code - * Scheme Begin Inside End Single - * plain 0 - - - - * IOB 0 1 - - - * IOE - 0 1 - - * IOBES 0 1 2 3 - * @endcode - * - * 'plain' means the whole chunk must contain exactly the same chunk label. 
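The CTC evaluator just deleted decodes by best path: an argmax per frame, then path2String drops blanks and merges consecutive repeats unless a blank frame separates them (blank_ being the last class id). That collapse rule in isolation:

```cpp
#include <cassert>
#include <vector>

// path2String's collapse rule: keep a label if it is not blank and it
// either differs from the last kept label or follows a blank frame.
std::vector<int> collapse(const std::vector<int>& path, int blank) {
  std::vector<int> str;
  int prev = -1;
  for (int label : path) {
    if (label != blank &&
        (str.empty() || label != str.back() || prev == blank)) {
      str.push_back(label);
    }
    prev = label;
  }
  return str;
}

int main() {
  int blank = 2;
  // frames: a a _ a b  ->  "a a b": the blank splits the repeated 'a'.
  assert(collapse({0, 0, 2, 0, 1}, blank) == (std::vector<int>{0, 0, 1}));
  // frames: a a b b    ->  "a b": repeats without a blank are merged.
  assert(collapse({0, 0, 1, 1}, blank) == (std::vector<int>{0, 1}));
  return 0;
}
```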
- */
-class ChunkEvaluator : public Evaluator {
-  int otherChunkType_;
-  int numChunkTypes_;  // number of chunk types besides other chunk type
-  int numTagTypes_;
-  int tagBegin_;
-  int tagInside_;
-  int tagEnd_;
-  int tagSingle_;
-
-  int64_t numLabelSegments_;
-  int64_t numOutputSegments_;
-  int64_t numCorrect_;
-
-  struct Segment {
-    int begin;
-    int end;
-    int type;
-    bool operator==(const Segment& y) const {
-      return begin == y.begin && end == y.end && type == y.type;
-    }
-  };
-
-  std::vector<Segment> labelSegments_;
-  std::vector<Segment> outputSegments_;
-  std::set<int> excludedChunkTypes_;
-  mutable std::unordered_map<std::string, real> values_;
-
- public:
-  virtual void init(const EvaluatorConfig& config) {
-    Evaluator::init(config);
-    if (config.chunk_scheme() == "IOB") {
-      numTagTypes_ = 2;
-      tagBegin_ = 0;
-      tagInside_ = 1;
-      tagEnd_ = -1;
-      tagSingle_ = -1;
-    } else if (config.chunk_scheme() == "IOE") {
-      numTagTypes_ = 2;
-      tagBegin_ = -1;
-      tagInside_ = 0;
-      tagEnd_ = 1;
-      tagSingle_ = -1;
-    } else if (config.chunk_scheme() == "IOBES") {
-      numTagTypes_ = 4;
-      tagBegin_ = 0;
-      tagInside_ = 1;
-      tagEnd_ = 2;
-      tagSingle_ = 3;
-    } else if (config.chunk_scheme() == "plain") {
-      numTagTypes_ = 1;
-      tagBegin_ = -1;
-      tagInside_ = -1;
-      tagEnd_ = -1;
-      tagSingle_ = -1;
-    } else {
-      LOG(FATAL) << "Unknown chunk scheme: " << config.chunk_scheme();
-    }
-    CHECK(config.has_num_chunk_types()) << "Missing num_chunk_types in config";
-    otherChunkType_ = numChunkTypes_ = config.num_chunk_types();
-
-    // the chunks of types in excludedChunkTypes_ will not be counted
-    auto& tmp = config.excluded_chunk_types();
-    excludedChunkTypes_.insert(tmp.begin(), tmp.end());
-  }
-
-  virtual void start() {
-    Evaluator::start();
-    numLabelSegments_ = 0;
-    numOutputSegments_ = 0;
-    numCorrect_ = 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    storeLocalValues();
-    os << config_.name() << "=" << values_["F1-score"]
-       << " true_chunks=" << numLabelSegments_
-       << " result_chunks=" << numOutputSegments_
-       << " correct_chunks=" << numCorrect_;
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    int64_t buf[3] = {numLabelSegments_, numOutputSegments_, numCorrect_};
-    client->reduce(buf, buf, 3, FLAGS_trainer_id, 0);
-    numLabelSegments_ = buf[0];
-    numOutputSegments_ = buf[1];
-    numCorrect_ = buf[2];
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_EQ(arguments.size(), (size_t)2);
-    IVectorPtr& output = arguments[0].ids;
-    IVectorPtr& label = arguments[1].ids;
-    CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
-    auto sequenceStartPositions =
-        arguments[1].sequenceStartPositions->getVector(false);
-    CHECK_EQ(output->getSize(), label->getSize());
-    CHECK(sequenceStartPositions);
-    size_t numSequences = sequenceStartPositions->getSize() - 1;
-    const int* starts = sequenceStartPositions->getData();
-    for (size_t i = 0; i < numSequences; ++i) {
-      eval1(output->getData() + starts[i],
-            label->getData() + starts[i],
-            starts[i + 1] - starts[i]);
-    }
-    return 0;
-  }
-
-  void eval1(int* output, int* label, int length) {
-    getSegments(output, length, outputSegments_);
-    getSegments(label, length, labelSegments_);
-    size_t i = 0, j = 0;
-    while (i < outputSegments_.size() && j < labelSegments_.size()) {
-      if (outputSegments_[i] == labelSegments_[j] &&
-          excludedChunkTypes_.count(outputSegments_[i].type) != 1) {
-        ++numCorrect_;
-      }
-      if (outputSegments_[i].end < labelSegments_[j].end) {
-        ++i;
-      } else if (outputSegments_[i].end > labelSegments_[j].end) {
-        ++j;
-      } else {
-        ++i;
-        ++j;
-      }
-    }
-    for (auto& segment : labelSegments_) {
-      if (excludedChunkTypes_.count(segment.type) != 1) ++numLabelSegments_;
-    }
-    for (auto& segment : outputSegments_) {
-      if (excludedChunkTypes_.count(segment.type) != 1) ++numOutputSegments_;
-    }
-  }
-
-  void getSegments(int* label, int length, std::vector<Segment>& segments) {
-    segments.clear();
-    segments.reserve(length);
-    int chunkStart = 0;
-    bool inChunk = false;
-    int tag = -1;
-    int type = otherChunkType_;
-    for (int i = 0; i < length; ++i) {
-      int prevTag = tag;
-      int prevType = type;
-      CHECK_LE(label[i], numChunkTypes_ * numTagTypes_);
-      tag = label[i] % numTagTypes_;
-      type = label[i] / numTagTypes_;
-      if (inChunk && isChunkEnd(prevTag, prevType, tag, type)) {
-        Segment segment{
-            chunkStart,  // begin
-            i - 1,       // end
-            prevType,
-        };
-        segments.push_back(segment);
-        inChunk = false;
-      }
-      if (isChunkBegin(prevTag, prevType, tag, type)) {
-        chunkStart = i;
-        inChunk = true;
-      }
-    }
-    if (inChunk) {
-      Segment segment{
-          chunkStart,  // begin
-          length - 1,  // end
-          type,
-      };
-      segments.push_back(segment);
-    }
-  }
-
-  // whether (prevTag, prevType) is the end of a chunk
-  bool isChunkEnd(int prevTag, int prevType, int tag, int type) {
-    if (prevType == otherChunkType_) return false;
-    if (type == otherChunkType_) return true;
-    if (type != prevType) return true;
-    if (prevTag == tagBegin_) return tag == tagBegin_ || tag == tagSingle_;
-    if (prevTag == tagInside_) return tag == tagBegin_ || tag == tagSingle_;
-    if (prevTag == tagEnd_) return true;
-    if (prevTag == tagSingle_) return true;
-    return false;
-  }
-
-  // whether (tag, type) is the beginning of a chunk
-  bool isChunkBegin(int prevTag, int prevType, int tag, int type) {
-    if (prevType == otherChunkType_) return type != otherChunkType_;
-    if (type == otherChunkType_) return false;
-    if (type != prevType) return true;
-    if (tag == tagBegin_) return true;
-    if (tag == tagInside_) return prevTag == tagEnd_ || prevTag == tagSingle_;
-    if (tag == tagEnd_) return prevTag == tagEnd_ || prevTag == tagSingle_;
-    if (tag == tagSingle_) return true;
-    return false;
-  }
-
-  // three metrics: precision, recall and F1-score
-  void getNames(std::vector<std::string>* names) {
-    storeLocalValues();
-    names->reserve(names->size() + values_.size());
-    for (auto it = values_.begin(); it != values_.end(); ++it) {
-      names->push_back(config_.name() + "." + it->first);
-    }
-  }
-
-  // get value by field name
-  real getValue(const std::string& name, Error* err) const {
-    storeLocalValues();
-    std::vector<std::string> buffers;
-    paddle::str::split(name, '.', &buffers);
-    auto it = values_.find(buffers.back());
-    if (it == values_.end()) {  // not found
-      *err = Error("No such key %s", name.c_str());
-      return 0.0f;
-    }
-
-    return it->second;
-  }
-
-  // get type of evaluator
-  std::string getType(const std::string& name, Error* err) const {
-    this->getValue(name, err);
-    if (!err->isOK()) {
-      return "";
-    }
-    return "chunk";
-  }
-
- private:
-  void storeLocalValues() const {
-    CHECK_GE(numOutputSegments_, 0);
-    CHECK_GE(numLabelSegments_, 0);
-    double precision =
-        !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_;
-    double recall =
-        !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_;
-    values_["precision"] = precision;
-    values_["recall"] = recall;
-    values_["F1-score"] =
-        !numCorrect_ ? 0 : 2 * precision * recall / (precision + recall);
-  }
-};
-
-REGISTER_EVALUATOR(chunk, ChunkEvaluator);
-
-}  // namespace paddle
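Editor's note: since this hunk removes the only reference implementation, here is a minimal standalone sketch (independent of the deleted Paddle classes) of the label encoding the deleted ChunkEvaluator uses -- with the IOBES scheme, `numTagTypes_ = 4` and a label id packs `(type, tag)` as `label = type * numTagTypes + tag`, from which segment counts yield F1:

```cpp
#include <cstdio>

int main() {
  const int numTagTypes = 4;  // IOBES: tag 0=B, 1=I, 2=E, 3=S
  // One three-token chunk (B I E) and one single-token chunk (S), both type 0.
  int labels[4] = {0 * numTagTypes + 0,   // B-0
                   0 * numTagTypes + 1,   // I-0
                   0 * numTagTypes + 2,   // E-0
                   0 * numTagTypes + 3};  // S-0
  for (int l : labels) {
    printf("label=%d -> type=%d tag=%d\n", l, l / numTagTypes, l % numTagTypes);
  }
  // With, say, 3 correct segments out of 4 predicted and 5 gold segments:
  double precision = 3.0 / 4.0, recall = 3.0 / 5.0;
  printf("F1=%f\n", 2 * precision * recall / (precision + recall));
  return 0;
}
```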
diff --git a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
deleted file mode 100644
index ddb8ebca784db4a83c328ff75f5c50c7aecd7352..0000000000000000000000000000000000000000
--- a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Evaluator.h"
-#include "paddle/gserver/layers/DetectionUtil.h"
-
-using std::map;
-using std::vector;
-using std::pair;
-using std::make_pair;
-
-namespace paddle {
-
-/**
- * @brief detection map Evaluator
- *
- * The config file api is detection_map_evaluator.
- */
-class DetectionMAPEvaluator : public Evaluator {
- public:
-  DetectionMAPEvaluator()
-      : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {}
-
-  virtual void start() {
-    Evaluator::start();
-    allTruePos_.clear();
-    allFalsePos_.clear();
-    numPos_.clear();
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    overlapThreshold_ = config_.overlap_threshold();
-    backgroundId_ = config_.background_id();
-    evaluateDifficult_ = config_.evaluate_difficult();
-    apType_ = config_.ap_type();
-
-    MatrixPtr detectTmpValue = arguments[0].value;
-    Matrix::resizeOrCreate(cpuOutput_,
-                           detectTmpValue->getHeight(),
-                           detectTmpValue->getWidth(),
-                           false,
-                           false);
-
-    MatrixPtr labelTmpValue = arguments[1].value;
-    Matrix::resizeOrCreate(cpuLabel_,
-                           labelTmpValue->getHeight(),
-                           labelTmpValue->getWidth(),
-                           false,
-                           false);
-
-    cpuOutput_->copyFrom(*detectTmpValue);
-    cpuLabel_->copyFrom(*labelTmpValue);
-
-    Argument label = arguments[1];
-    const int* labelIndex = label.sequenceStartPositions->getData(false);
-    size_t batchSize = label.getNumSequences();
-
-    vector<map<size_t, vector<NormalizedBBox>>> allGTBBoxes;
-    vector<map<size_t, vector<pair<real, NormalizedBBox>>>> allDetectBBoxes;
-
-    for (size_t n = 0; n < batchSize; ++n) {
-      map<size_t, vector<NormalizedBBox>> bboxes;
-      for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) {
-        vector<NormalizedBBox> bbox;
-        getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox);
-        int c = cpuLabel_->getData()[i * 6];
-        bboxes[c].push_back(bbox[0]);
-      }
-      allGTBBoxes.push_back(bboxes);
-    }
-
-    size_t n = 0;
-    const real* cpuOutputData = cpuOutput_->getData();
-    for (size_t imgId = 0; imgId < batchSize; ++imgId) {
-      map<size_t, vector<pair<real, NormalizedBBox>>> bboxes;
-      size_t curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]);
-      while (curImgId == imgId && n < cpuOutput_->getHeight()) {
-        vector<real> label;
-        vector<real> score;
-        vector<NormalizedBBox> bbox;
-        getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox);
-        bboxes[label[0]].push_back(make_pair(score[0], bbox[0]));
-        ++n;
-        curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]);
-      }
-      allDetectBBoxes.push_back(bboxes);
-    }
-
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (map<size_t, vector<NormalizedBBox>>::iterator it =
-               allGTBBoxes[n].begin();
-           it != allGTBBoxes[n].end();
-           ++it) {
-        size_t count = 0;
-        if (evaluateDifficult_) {
-          count = it->second.size();
-        } else {
-          for (size_t i = 0; i < it->second.size(); ++i)
-            if (!(it->second[i].isDifficult)) ++count;
-        }
-        if (numPos_.find(it->first) == numPos_.end() && count != 0) {
-          numPos_[it->first] = count;
-        } else {
-          numPos_[it->first] += count;
-        }
-      }
-    }
-
-    // calcTFPos
-    calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes);
-
-    return 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    real mAP = calcMAP();
-    os << "Detection mAP=" << mAP;
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    LOG(FATAL) << "Distribute detection evaluation not implemented.";
-  }
-
- protected:
-  void calcTFPos(const size_t batchSize,
-                 const vector<map<size_t, vector<NormalizedBBox>>>& allGTBBoxes,
-                 const vector<map<size_t, vector<pair<real, NormalizedBBox>>>>&
-                     allDetectBBoxes) {
-    for (size_t n = 0; n < allDetectBBoxes.size(); ++n) {
-      if (allGTBBoxes[n].size() == 0) {
-        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
-                 it = allDetectBBoxes[n].begin();
-             it != allDetectBBoxes[n].end();
-             ++it) {
-          size_t label = it->first;
-          for (size_t i = 0; i < it->second.size(); ++i) {
-            allTruePos_[label].push_back(make_pair(it->second[i].first, 0));
-            allFalsePos_[label].push_back(make_pair(it->second[i].first, 1));
-          }
-        }
-      } else {
-        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
-                 it = allDetectBBoxes[n].begin();
-             it != allDetectBBoxes[n].end();
-             ++it) {
-          size_t label = it->first;
-          vector<pair<real, NormalizedBBox>> predBBoxes = it->second;
-          if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) {
-            for (size_t i = 0; i < predBBoxes.size(); ++i) {
-              allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
-              allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1));
-            }
-          } else {
-            vector<NormalizedBBox> gtBBoxes =
-                allGTBBoxes[n].find(label)->second;
-            vector<bool> visited(gtBBoxes.size(), false);
-            // Sort detections in descend order based on scores
-            std::sort(predBBoxes.begin(),
-                      predBBoxes.end(),
-                      sortScorePairDescend<NormalizedBBox>);
-            for (size_t i = 0; i < predBBoxes.size(); ++i) {
-              real maxOverlap = -1.0;
-              size_t maxIdx = 0;
-              for (size_t j = 0; j < gtBBoxes.size(); ++j) {
-                real overlap =
-                    jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]);
-                if (overlap > maxOverlap) {
-                  maxOverlap = overlap;
-                  maxIdx = j;
-                }
-              }
-              if (maxOverlap > overlapThreshold_) {
-                if (evaluateDifficult_ ||
-                    (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) {
-                  if (!visited[maxIdx]) {
-                    allTruePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 1));
-                    allFalsePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 0));
-                    visited[maxIdx] = true;
-                  } else {
-                    allTruePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 0));
-                    allFalsePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 1));
-                  }
-                }
-              } else {
-                allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
-                allFalsePos_[label].push_back(
-                    make_pair(predBBoxes[i].first, 1));
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  real calcMAP() const {
-    real mAP = 0.0;
-    size_t count = 0;
-    for (map<size_t, size_t>::const_iterator it = numPos_.begin();
-         it != numPos_.end();
-         ++it) {
-      size_t label = it->first;
-      size_t labelNumPos = it->second;
-      if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end())
-        continue;
-      vector<pair<real, size_t>> labelTruePos = allTruePos_.find(label)->second;
-      vector<pair<real, size_t>> labelFalsePos =
-          allFalsePos_.find(label)->second;
-      // Compute average precision.
-      vector<size_t> tpCumSum;
-      getAccumulation(labelTruePos, &tpCumSum);
-      vector<size_t> fpCumSum;
-      getAccumulation(labelFalsePos, &fpCumSum);
-      std::vector<real> precision, recall;
-      size_t num = tpCumSum.size();
-      // Compute Precision.
-      for (size_t i = 0; i < num; ++i) {
-        CHECK_LE(tpCumSum[i], labelNumPos);
-        precision.push_back(static_cast<real>(tpCumSum[i]) /
-                            static_cast<real>(tpCumSum[i] + fpCumSum[i]));
-        recall.push_back(static_cast<real>(tpCumSum[i]) / labelNumPos);
-      }
-      // VOC2007 style
-      if (apType_ == "11point") {
-        vector<real> maxPrecisions(11, 0.0);
-        int startIdx = num - 1;
-        for (int j = 10; j >= 0; --j)
-          for (int i = startIdx; i >= 0; --i) {
-            if (recall[i] < j / 10.) {
-              startIdx = i;
-              if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j];
-              break;
-            } else {
-              if (maxPrecisions[j] < precision[i])
-                maxPrecisions[j] = precision[i];
-            }
-          }
-        for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11;
-        ++count;
-      } else if (apType_ == "Integral") {
-        // Natural integral
-        real averagePrecisions = 0.;
-        real prevRecall = 0.;
-        for (size_t i = 0; i < num; ++i) {
-          if (fabs(recall[i] - prevRecall) > 1e-6)
-            averagePrecisions += precision[i] * fabs(recall[i] - prevRecall);
-          prevRecall = recall[i];
-        }
-        mAP += averagePrecisions;
-        ++count;
-      } else {
-        LOG(FATAL) << "Unknown ap type: " << apType_;
-      }
-    }
-    if (count != 0) mAP /= count;
-    return mAP * 100;
-  }
-
-  void getAccumulation(vector<pair<real, size_t>> inPairs,
-                       vector<size_t>* accuVec) const {
-    std::stable_sort(
-        inPairs.begin(), inPairs.end(), sortScorePairDescend<size_t>);
-    accuVec->clear();
-    size_t sum = 0;
-    for (size_t i = 0; i < inPairs.size(); ++i) {
-      sum += inPairs[i].second;
-      accuVec->push_back(sum);
-    }
-  }
-
-  std::string getTypeImpl() const { return "detection_map"; }
-
-  real getValueImpl() const { return calcMAP(); }
-
- private:
-  real overlapThreshold_;   // overlap threshold when determining a match
-  bool evaluateDifficult_;  // whether to evaluate difficult ground truth
-  size_t backgroundId_;     // class index of background
-  std::string apType_;      // how to calculate mAP (Integral or 11point)
-
-  MatrixPtr cpuOutput_;
-  MatrixPtr cpuLabel_;
-
-  map<size_t, size_t> numPos_;  // count of ground-truth objects per class
-  map<size_t, vector<pair<real, size_t>>>
-      allTruePos_;  // true positive predictions
-  map<size_t, vector<pair<real, size_t>>>
-      allFalsePos_;  // false positive predictions
-};
-
-REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator);
-
-}  // namespace paddle
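Editor's note: a minimal standalone sketch (independent of the deleted classes) of the VOC2007 "11point" average precision that the deleted calcMAP() computes -- take the maximum precision at each of the eleven recall thresholds 0.0, 0.1, ..., 1.0 and average them; the precision/recall values below are made up for illustration:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical cumulative precision/recall, ordered by descending score.
  std::vector<double> precision = {1.0, 0.5, 0.67, 0.75};
  std::vector<double> recall = {0.25, 0.25, 0.5, 0.75};
  double ap = 0.0;
  for (int j = 0; j <= 10; ++j) {
    double maxPrec = 0.0;  // best precision at recall >= j/10
    for (size_t i = 0; i < recall.size(); ++i) {
      if (recall[i] >= j / 10.0 && precision[i] > maxPrec) maxPrec = precision[i];
    }
    ap += maxPrec / 11.0;
  }
  printf("11-point AP = %f\n", ap);
  return 0;
}
```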
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
deleted file mode 100644
index 941fb8fb539d58cca22ecf563d2effa816243c3b..0000000000000000000000000000000000000000
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ /dev/null
@@ -1,1361 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/gserver/evaluators/Evaluator.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/StringUtil.h"
-
-DECLARE_int32(trainer_id);
-
-namespace paddle {
-
-void Evaluator::eval(const NeuralNetwork& nn) {
-  std::vector<Argument> arguments;
-  arguments.reserve(config_.input_layers_size());
-  for (const std::string& name : config_.input_layers()) {
-    arguments.push_back(nn.getLayer(name)->getOutput());
-  }
-  SetDevice device(arguments[0].deviceId);
-  real score = evalImp(arguments);
-  totalScore_ += score;
-  updateSamplesNum(arguments);
-}
-/**
- * @brief classification error Evaluator
- *
- * The config file api is classification_error_evaluator.
- */
-class ClassificationErrorEvaluator : public Evaluator {
- public:
-  /*
-  ClassificationErrorEvaluator() : totalScore2_(0) {}
-
-  virtual void start() {
-    Evaluator::start();
-    totalScore2_ = 0;
-  } */
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (3 == arguments.size()) {
-      numSamples_ += arguments[2].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  MatrixPtr calcError(std::vector<Argument>& arguments) {
-    CHECK_GE(arguments.size(), (size_t)2);
-    CHECK_LE(arguments.size(), (size_t)3);
-    MatrixPtr& output = arguments[0].value;
-    IVectorPtr& label = arguments[1].ids;
-    MatrixPtr& multiBinaryLabel = arguments[1].value;  // For multi binary label
-    bool supportWeight = (3 == arguments.size()) ? true : false;
-    MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-    if (nullptr == output ||
-        (nullptr == label && nullptr == multiBinaryLabel) ||
-        (supportWeight && nullptr == weight)) {
-      return 0;
-    }
-
-    if (label != nullptr) {
-      CHECK_EQ(label->getSize(), output->getHeight());
-    } else {
-      CHECK_EQ(multiBinaryLabel->getHeight(), output->getHeight());
-      CHECK_EQ(multiBinaryLabel->getWidth(), output->getWidth());
-    }
-    if (supportWeight) {
-      CHECK_EQ(output->getHeight(), weight->getHeight());
-      CHECK_EQ((size_t)1, weight->getWidth());
-    }
-
-    const MatrixPtr errorMat = Matrix::create(output->getHeight(),
-                                              1,
-                                              /* trans= */ false,
-                                              useGpu(arguments[0].deviceId));
-
-    errorMat->zeroMem();
-
-    if (label != nullptr) {
-      errorMat->classificationError(*output, *label, config_.top_k());
-    } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
-               dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
-      errorMat->classificationErrorMulti(
-          *output, *multiBinaryLabel, config_.classification_threshold());
-    } else {
-      errorMat->binaryClassificationError(
-          0, *output, *multiBinaryLabel, config_.classification_threshold());
-    }
-
-    if (supportWeight) {
-      errorMat->dotMul(*errorMat, *weight);
-    }
-    return errorMat;
-  }
-
-  void printStats(std::ostream& os) const {
-    if (config_.top_k() == 1) {
-      os << config_.name() << "="
-         << (numSamples_ ? totalScore_ / numSamples_ : 0);
-    } else {
-      os << " top_" << config_.top_k()
-         << "_error=" << (numSamples_ ? totalScore_ / numSamples_ : 0);
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    MatrixPtr errorMat = calcError(arguments);
-    return errorMat->getSum();
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const { return "classification_error"; }
-};
-
-/**
- * @brief sequence classification error Evaluator
- * @note sequence level classification error stats:
- * if any frame in one sequence has an error, the sequence is an error
- */
-class SequenceClassificationErrorEvaluator
-    : public ClassificationErrorEvaluator {
- public:
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    numSamples_ += arguments[0].getNumSequences();
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    auto sequenceStartPositions =
-        arguments[0].sequenceStartPositions->getVector(false);
-    CHECK(sequenceStartPositions != nullptr);
-    const int* starts = sequenceStartPositions->getData();
-
-    MatrixPtr errorMat = calcError(arguments);
-
-    int errCounter = 0;
-    CpuVector errorVec(0, nullptr);
-    for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) {
-      errorVec.subVecFrom(
-          errorMat->getData(), starts[i], starts[i + 1] - starts[i]);
-      if (errorVec.getSum() > 0) {
-        errCounter += 1;
-      }
-    }
-
-    return static_cast<real>(errCounter);
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const { return "seq_classification_error"; }
-};
-REGISTER_EVALUATOR(seq_classification_error,
-                   SequenceClassificationErrorEvaluator);
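Editor's note: a minimal standalone sketch (independent of the deleted classes) of the sequence-level rule the deleted SequenceClassificationErrorEvaluator applies -- a sequence counts as one error if any of its frames is wrong:

```cpp
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> frameError = {0, 0, 1, 0, 0, 0};  // per-frame 0/1 errors
  std::vector<int> starts = {0, 3, 6};               // two sequences: [0,3), [3,6)
  int errSeq = 0;
  for (size_t i = 0; i + 1 < starts.size(); ++i) {
    int sum = 0;
    for (int k = starts[i]; k < starts[i + 1]; ++k) sum += frameError[k];
    if (sum > 0) ++errSeq;  // any wrong frame marks the whole sequence wrong
  }
  printf("sequence errors: %d of %zu\n", errSeq, starts.size() - 1);
  return 0;
}
```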
-/**
- * @brief sum Evaluator
- * Calculate the sum of output or label
- *
- * The config file api is sum_evaluator.
- */
-class SumEvaluator : public Evaluator {
- public:
-  SumEvaluator() : cpuLabel_(nullptr), cpuWeight_(nullptr) {}
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (2 == arguments.size()) {
-      numSamples_ += arguments[1].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    REGISTER_TIMER("SumEvaluator");
-    CHECK_GE(arguments.size(), (size_t)1);
-    CHECK_LE(arguments.size(), (size_t)2);
-    bool supportWeight = (2 == arguments.size()) ? true : false;
-    if (supportWeight) {
-      if (nullptr == arguments[1].value) {
-        return 0;
-      }
-      CHECK_EQ(arguments[1].value->getWidth(), (size_t)1);
-    }
-
-    // The sum of output
-    if (arguments[0].value) {
-      if (supportWeight) {
-        CHECK_EQ(arguments[0].value->getHeight(),
-                 arguments[1].value->getHeight());
-        MatrixPtr tmpMat = Matrix::create(arguments[0].value->getHeight(),
-                                          arguments[0].value->getWidth(),
-                                          /* trans= */ false,
-                                          arguments[0].value->useGpu());
-        tmpMat->copyFrom(*arguments[0].value);
-        tmpMat->rowScale(0, *tmpMat, *arguments[1].value);
-        return tmpMat->getSum();
-      } else {
-        return arguments[0].value->getSum();
-      }
-      // The sum of label
-    } else if (arguments[0].ids) {
-      size_t insNum = arguments[0].ids->getSize();
-      IVectorPtr label = arguments[0].ids;
-      MatrixPtr weight = supportWeight ? arguments[1].value : nullptr;
-      if (dynamic_cast<GpuIVector*>(label.get())) {
-        IVector::resizeOrCreate(cpuLabel_, insNum, false);
-        cpuLabel_->copyFrom(*arguments[0].ids);
-
-        if (supportWeight) {
-          CHECK_EQ(insNum, arguments[1].value->getHeight());
-          Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-          cpuWeight_->copyFrom(*arguments[1].value);
-        }
-
-        label = cpuLabel_;
-        weight = cpuWeight_;
-      }
-
-      if (supportWeight) {
-        real score = 0.0;
-        int* labelD = label->getData();
-        real* weightD = weight->getData();
-        for (size_t i = 0; i < insNum; ++i) {
-          score += (labelD[i] * weightD[i]);
-        }
-        return score;
-      } else {
-        return label->getSum();
-      }
-    } else {
-      return 0;
-    }
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
- private:
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const { return "sum"; }
-};
-/**
- * @brief column sum Evaluator
- * @note column sum for the colIdx-th column
- *
- * - colIdx = 0: the 0-th column.
- * - colIdx > 0: the colIdx-th column.
- * - colIdx < 0: the last colIdx-th column.
- *
- * The config file api is column_sum_evaluator.
- *
- */
-class ColumnSumEvaluator : public Evaluator {
- public:
-  explicit ColumnSumEvaluator(int32_t colIdx)
-      : colIdx_(colIdx), colNum_(0), sum_(nullptr) {}
-
-  virtual void start() {
-    Evaluator::start();
-    if (nullptr != sum_) {
-      sum_->zeroMem();
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (2 == arguments.size()) {
-      numSamples_ += arguments[1].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    REGISTER_TIMER("ColumnSumEvaluator");
-    CHECK_GE(arguments.size(), (size_t)1);
-    CHECK_LE(arguments.size(), (size_t)2);
-    bool supportWeight = (2 == arguments.size()) ? true : false;
-    if (nullptr == arguments[0].value ||
-        (supportWeight && nullptr == arguments[1].value)) {
-      return 0;
-    }
-
-    size_t insNum = arguments[0].value->getHeight();
-    size_t colNum = arguments[0].value->getWidth();
-    if (nullptr == sum_) {
-      sum_ = Matrix::create((size_t)1, colNum, false, /* useGpu */ false);
-      colNum_ = colNum;
-      sum_->zeroMem();
-    } else {
-      CHECK_EQ(colNum, sum_->getWidth());
-    }
-
-    if (supportWeight) {
-      CHECK_EQ(insNum, arguments[1].value->getHeight());
-      CHECK_EQ((size_t)1, arguments[1].value->getWidth());
-      MatrixPtr tmpMat = Matrix::create(insNum, colNum);
-      if (arguments[0].value->useGpu()) {
-        tmpMat->copyFrom(*arguments[0].value);
-      }
-      if (!arguments[1].value->useGpu()) {
-        if (!arguments[0].value->useGpu()) {
-          tmpMat->rowScale(0, *arguments[0].value, *arguments[1].value);
-        } else {
-          tmpMat->rowScale(0, *tmpMat, *arguments[1].value);
-        }
-      } else {
-        MatrixPtr tmp2 = Matrix::create(insNum, 1);
-        tmp2->copyFrom(*arguments[1].value);
-        if (!arguments[0].value->useGpu()) {
-          tmpMat->rowScale(0, *arguments[0].value, *tmp2);
-        } else {
-          tmpMat->rowScale(0, *tmpMat, *tmp2);
-        }
-      }
-      sum_->accumulateColSum(*tmpMat);
-    } else {
-      if (!arguments[0].value->useGpu()) {
-        sum_->accumulateColSum(*arguments[0].value);
-      } else {
-        MatrixPtr tmpMat = Matrix::create(insNum, colNum);
-        tmpMat->copyFrom(*arguments[0].value);
-        sum_->accumulateColSum(*tmpMat);
-      }
-    }
-    return 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    CHECK(colIdx_ + (int32_t)colNum_ >= 0 && colIdx_ - (int32_t)colNum_ < 0)
-        << "column index [" << colIdx_ << "] out of range [-" << colNum_
-        << ", " << colNum_ << ")";
-    size_t colIdx = 0;
-    if (colIdx_ >= 0) {
-      colIdx = colIdx_;
-    } else {
-      colIdx = colNum_ + colIdx_;
-    }
-    os << config_.name() << "="
-       << (numSamples_ ? sum_->getElement(0, colIdx) / numSamples_ : 0);
-  }
-
-  void distributeEval(ParameterClient2* client) {
-    client->reduce(
-        sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0);
-    client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0);
-  }
-
- private:
-  int32_t colIdx_;
-  size_t colNum_;
-  MatrixPtr sum_; /* cpu matrix */
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const {
-    if (colIdx_ == -1)
-      return "last-column-sum";
-    else
-      return "column-sum";
-  }
-};
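Editor's note: a minimal standalone sketch (independent of the deleted classes) of the column-index convention shared by the deleted ColumnSumEvaluator and AucEvaluator -- a negative colIdx counts back from the last column:

```cpp
#include <cstdio>

int main() {
  const int colNum = 10;
  for (int colIdx : {0, 3, -1, -2}) {
    // Same normalization as printStats()/evalImp(): negative indexes wrap.
    int resolved = colIdx >= 0 ? colIdx : colNum + colIdx;
    printf("colIdx=%d -> column %d of %d\n", colIdx, resolved, colNum);
  }
  return 0;
}
```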
-void AucEvaluator::start() {
-  Evaluator::start();
-  memset(statPos_, 0, sizeof(statPos_));
-  memset(statNeg_, 0, sizeof(statNeg_));
-}
-
-real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
-  REGISTER_TIMER("AucEvaluator");
-  CHECK_GE(arguments.size(), (size_t)2);
-  CHECK_LE(arguments.size(), (size_t)3);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  MatrixPtr labelval = arguments[1].value;
-  bool supportWeight = (3 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-
-  if (nullptr == output || (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-  size_t insNum = output->getHeight();
-  size_t outputDim = output->getWidth();
-  // Copy label from value to a vector.
-  if (nullptr == label && nullptr != labelval) {
-    // label width is 1
-    CHECK_EQ(1U, labelval->getWidth());
-    VectorPtr vec =
-        Vector::create(labelval->getData(), insNum, output->useGpu());
-    label = vec->castToInt();
-  }
-
-  CHECK_EQ(insNum, label->getSize());
-  if (supportWeight) {
-    CHECK_EQ(insNum, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  CHECK(colIdx_ + (int32_t)outputDim >= 0 && colIdx_ - (int32_t)outputDim < 0)
-      << "column index [" << colIdx_ << "] out of range [-" << outputDim
-      << ", " << outputDim << ")";
-  realColumnIdx_ = 0;
-  if (colIdx_ >= 0) {
-    realColumnIdx_ = colIdx_;
-  } else {
-    realColumnIdx_ = outputDim + colIdx_;
-  }
-
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    Matrix::resizeOrCreate(cpuOutput_,
-                           insNum,
-                           outputDim,
-                           /* trans=*/false,
-                           /* useGpu=*/false);
-    cpuOutput_->copyFrom(*output);
-    IVector::resizeOrCreate(cpuLabel_, insNum, false);
-    cpuLabel_->copyFrom(*label);
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-    }
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    weight = cpuWeight_;
-  }
-
-  real* outputD = output->getData();
-  int* labelD = label->getData();
-  real* weightD = supportWeight ? weight->getData() : nullptr;
-  size_t pos = realColumnIdx_;
-
-  for (size_t i = 0; i < insNum; ++i) {
-    real value = outputD[pos];
-    uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_);
-    CHECK(binIdx <= kBinNum_) << "bin index [" << binIdx
-                              << "] out of range, predict value[" << value
-                              << "]";
-    real w = supportWeight ? weightD[i] : 1.0;
-    if (labelD[i] == kNegativeLabel_) {
-      statNeg_[binIdx] += w;
-    } else {
-      statPos_[binIdx] += w;
-    }
-    pos += outputDim;
-  }
-  return 0;
-}
-
-void AucEvaluator::distributeEval(ParameterClient2* client) {
-  client->reduce(statPos_, statPos_, kBinNum_ + 1, FLAGS_trainer_id, 0);
-  client->reduce(statNeg_, statNeg_, kBinNum_ + 1, FLAGS_trainer_id, 0);
-}
-
-double AucEvaluator::calcAuc() const {
-  double totPos = 0.0;
-  double totNeg = 0.0;
-  double totPosPrev = 0.0;
-  double totNegPrev = 0.0;
-  double auc = 0.0;
-
-  int64_t idx = kBinNum_;
-  while (idx >= 0) {
-    totPosPrev = totPos;
-    totNegPrev = totNeg;
-    totPos += statPos_[idx];
-    totNeg += statNeg_[idx];
-    auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
-    --idx;
-  }
-
-  if (totPos > 0.0 && totNeg > 0.0) {
-    return auc / totPos / totNeg;
-  } else {
-    return 0.0;
-  }
-}
-
-real AucEvaluator::getValueImpl() const { return calcAuc(); }
-
-std::string AucEvaluator::getTypeImpl() const {
-  if (colIdx_ == -1) {
-    return "last-column-auc";
-  } else {
-    return "auc";
-  }
-}
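Editor's note: a minimal standalone sketch (independent of the deleted classes, with made-up bin counts) of the binned AUC computation in the deleted calcAuc() -- predictions are bucketed by score, and the ROC area is accumulated with the trapezoid rule while sweeping bins from the highest score down:

```cpp
#include <cstdio>

int main() {
  const int kBins = 4;
  double statPos[kBins + 1] = {0, 5, 10, 20, 15};  // positives per score bin
  double statNeg[kBins + 1] = {30, 10, 5, 3, 2};   // negatives per score bin
  double totPos = 0, totNeg = 0, auc = 0;
  for (int idx = kBins; idx >= 0; --idx) {
    double posPrev = totPos, negPrev = totNeg;
    totPos += statPos[idx];
    totNeg += statNeg[idx];
    // trapezoidArea(totNeg, negPrev, totPos, posPrev) in the deleted code
    auc += (totNeg - negPrev) * (totPos + posPrev) / 2.0;
  }
  printf("AUC = %f\n", (totPos > 0 && totNeg > 0) ? auc / totPos / totNeg : 0.0);
  return 0;
}
```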
-// class RankAucEvaluator
-REGISTER_EVALUATOR(rankauc, RankAucEvaluator);
-
-void RankAucEvaluator::start() { Evaluator::start(); }
-void RankAucEvaluator::updateSamplesNum(
-    const std::vector<Argument>& arguments) {
-  numSamples_ += arguments[0].getNumSequences();
-}
-real RankAucEvaluator::evalImp(std::vector<Argument>& arguments) {
-  CHECK_GE(arguments.size(), 2U);
-  CHECK_LE(arguments.size(), 3U);
-  double batchAuc = 0.0;
-  output_ = arguments[0].value;
-  click_ = arguments[1].value;
-  size_t batchSize = output_->getHeight();
-  CHECK(!output_->useGpu()) << "RankAUC evaluator does not support GPU!";
-
-  if (arguments.size() == 3U) {
-    pv_ = arguments[2].value;
-  } else {
-    Matrix::resizeOrCreate(pv_, batchSize, 1, false, false);
-    std::fill(pv_->getData(), pv_->getData() + batchSize, 1.0);
-  }
-
-  real* outputData = output_->getData();
-  real* clickData = click_->getData();
-  real* pvData = pv_->getData();
-
-  auto startPos = arguments[0].sequenceStartPositions->getVector(false);
-  const int* startPosData = startPos->getData();
-  size_t batchNum = startPos->getSize() - 1;
-  for (size_t i = 0; i < batchNum; ++i) {
-    int beginPos = startPosData[i];
-    int endPos = startPosData[i + 1];
-    batchAuc += calcRankAuc(outputData + beginPos,
-                            clickData + beginPos,
-                            pvData + beginPos,
-                            endPos - beginPos);
-  }
-  return batchAuc;
-}
-
-double RankAucEvaluator::calcRankAuc(real* outputData,
-                                     real* clickData,
-                                     real* pvData,
-                                     size_t size) {
-  outputPair_.clear();
-  for (size_t i = 0; i < size; ++i) {
-    outputPair_.push_back(std::make_pair(outputData[i], i));
-  }
-  std::sort(outputPair_.begin(),
-            outputPair_.end(),
-            [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-              return a.first > b.first;
-            });
-  double aucTmp = 0.0;
-  double clickSum = 0.0;
-  double oldClickSum = 0.0;
-  double noClick = 0.0;
-  double noClickSum = 0.0;
-
-  double lastScore = outputPair_[0].first + 1.0;
-  for (size_t i = 0; i < size; ++i) {
-    if (lastScore != outputPair_[i].first) {
-      aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
-      oldClickSum = clickSum;
-      noClick = 0.0;
-      lastScore = outputPair_[i].first;
-    }
-    size_t id = outputPair_[i].second;
-    noClick += pvData[id] - clickData[id];
-    noClickSum += noClick;
-    clickSum += clickData[id];
-  }
-  aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
-  return (clickSum * noClickSum) == 0.0 ? 0.0
-                                        : aucTmp / (clickSum * noClickSum);
-}
-
-std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; }
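Editor's note: a minimal standalone sketch (independent of the deleted classes, with made-up clicks/pv data) of the per-list click AUC in the deleted calcRankAuc() -- items are sorted by descending score, and trapezoids are accumulated only when the score changes, which is how ties are handled:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  struct Item { double score, click, pv; };  // pv - click = "no click" mass
  std::vector<Item> items = {{0.9, 1, 1}, {0.7, 0, 1}, {0.7, 1, 1}, {0.2, 0, 1}};
  std::sort(items.begin(), items.end(),
            [](const Item& a, const Item& b) { return a.score > b.score; });
  double auc = 0, clickSum = 0, oldClickSum = 0, noClick = 0, noClickSum = 0;
  double lastScore = items[0].score + 1.0;
  for (const Item& it : items) {
    if (lastScore != it.score) {  // score changed: close the trapezoid
      auc += (clickSum + oldClickSum) * noClick / 2.0;
      oldClickSum = clickSum;
      noClick = 0.0;
      lastScore = it.score;
    }
    noClick += it.pv - it.click;
    noClickSum += noClick;
    clickSum += it.click;
  }
  auc += (clickSum + oldClickSum) * noClick / 2.0;
  printf("rank AUC = %f\n",
         (clickSum * noClickSum) == 0.0 ? 0.0 : auc / (clickSum * noClickSum));
  return 0;
}
```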
-// class PrecisionRecallEvaluator
-REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator);
-
-void PrecisionRecallEvaluator::start() {
-  Evaluator::start();
-  statsInfo_.clear();
-  values_.clear();
-}
-
-real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
-  REGISTER_TIMER("PrecisionRecallEvaluator");
-  CHECK_GE(arguments.size(), (size_t)2);
-  CHECK_LE(arguments.size(), (size_t)3);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  MatrixPtr multiBinaryLabel = arguments[1].value;
-  bool supportWeight = (3 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-  if (nullptr == output || (nullptr == label && nullptr == multiBinaryLabel) ||
-      (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-
-  size_t insNum = output->getHeight();
-  size_t outputDim = output->getWidth();
-  if (label != nullptr) {
-    CHECK_EQ(insNum, label->getSize());
-  } else {
-    CHECK_EQ(insNum, multiBinaryLabel->getHeight());
-    CHECK_EQ(outputDim, multiBinaryLabel->getWidth());
-  }
-  if (supportWeight) {
-    CHECK_EQ(insNum, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  if (statsInfo_.size() != outputDim) {
-    statsInfo_.clear();
-    statsInfo_.resize(outputDim);
-  }
-
-  isMultiBinaryLabel_ = (nullptr == label) ? true : false;
-  if (label != nullptr) {
-    if (dynamic_cast<GpuMatrix*>(output.get())) {
-      Matrix::resizeOrCreate(cpuOutput_, insNum, outputDim, false, false);
-      cpuOutput_->copyFrom(*output);
-      IVector::resizeOrCreate(cpuLabel_, insNum, false);
-      cpuLabel_->copyFrom(*label);
-      if (supportWeight) {
-        Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-        cpuWeight_->copyFrom(*weight);
-      }
-
-      output = cpuOutput_;
-      label = cpuLabel_;
-      weight = cpuWeight_;
-    }
-    calcStatsInfo(output, label, weight);
-  } else {
-    // GPU is not supported for multi binary labels
-    CHECK(dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()));
-    calcStatsInfoMulti(output, multiBinaryLabel, weight);
-  }
-  return 0;
-}
-
-void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
-  PrintStatsInfo info;
-  bool containMacroMicroInfo = getStatsInfo(&info);
-  os << "positive_label=" << config_.positive_label()
-     << " precision=" << info.precision << " recall=" << info.recall
-     << " F1-score=" << info.f1;
-  if (containMacroMicroInfo) {
-    os << " macro-average-precision=" << info.macroAvgPrecision
-       << " macro-average-recall=" << info.macroAvgRecall
-       << " macro-average-F1-score=" << info.macroAvgF1Score;
-    if (!isMultiBinaryLabel_) {
-      // precision and recall are equal in this case
-      os << " micro-average-precision=" << info.microAvgPrecision;
-    } else {
-      os << " micro-average-precision=" << info.microAvgPrecision
-         << " micro-average-recall=" << info.microAvgRecall
-         << " micro-average-F1-score=" << info.microAvgF1Score;
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::calcStatsInfo(const MatrixPtr& output,
-                                             const IVectorPtr& label,
-                                             const MatrixPtr& weight) {
-  size_t insNum = output->getHeight();
-  size_t dim = output->getWidth();
-  real* outputD = output->getData();
-  int* labelD = label->getData();
-  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
-  for (size_t i = 0; i < insNum; ++i) {
-    CHECK_GE(labelD[i], 0);
-    CHECK_LT((size_t)labelD[i], dim);
-    size_t maxIdx = 0;
-    real maxValue = outputD[i * dim];
-    for (size_t j = 1; j < dim; ++j) {
-      size_t idx = i * dim + j;
-      if (maxValue < outputD[idx]) {
-        maxIdx = j;
-        maxValue = outputD[idx];
-      }
-    }
-
-    real w = (weightD != nullptr) ? weightD[i] : 1.0;
-    if (maxIdx == (size_t)labelD[i]) {
-      statsInfo_[maxIdx].TP += w;  // true positive for labelD[i]
-      // true negative for all labels except for labelD[i]
-      for (size_t j = 0; j < dim; ++j) {
-        statsInfo_[j].TN += w;
-      }
-      statsInfo_[maxIdx].TN -= w;
-    } else {
-      statsInfo_[labelD[i]].FN += w;  // false negative for labelD[i]
-      statsInfo_[maxIdx].FP += w;     // false positive for maxIdx
-      // true negatives for all labels except for maxIdx and labelD[i]
-      for (size_t j = 0; j < dim; ++j) {
-        statsInfo_[j].TN += w;
-      }
-      statsInfo_[maxIdx].TN -= w;
-      statsInfo_[labelD[i]].TN -= w;
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output,
-                                                  const MatrixPtr& label,
-                                                  const MatrixPtr& weight) {
-  size_t insNum = output->getHeight();
-  size_t dim = output->getWidth();
-  real* outputD = output->getData();
-  auto labelD = dynamic_cast<CpuSparseMatrix*>(label.get());
-  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
-  real threshold = config_.classification_threshold();
-  for (size_t i = 0; i < insNum; ++i) {
-    for (size_t j = 0; j < dim; ++j) {
-      real w = (weightD != nullptr) ? weightD[i] : 1.0;
-      size_t idx = i * dim + j;
-      if (outputD[idx] < threshold) {
-        statsInfo_[j].TN += w;  // true negative
-      } else {
-        statsInfo_[j].FP += w;  // false positive
-      }
-    }
-
-    const int* cols = labelD->getRowCols(i);
-    for (size_t j = 0; j < labelD->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      real w = (weightD != nullptr) ? weightD[i] : 1.0;
-      size_t idx = i * dim + cols[j];
-      if (outputD[idx] < threshold) {
-        statsInfo_[cols[j]].FN += w;  // false negative
-        statsInfo_[cols[j]].TN -= w;  // true negative
-      } else {
-        statsInfo_[cols[j]].TP += w;  // true positive
-        statsInfo_[cols[j]].FP -= w;  // false positive
-      }
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::storeLocalValues() const {
-  if (this->values_.size() == 0) {
-    PrintStatsInfo info;
-    bool containMacroMicroInfo = getStatsInfo(&info);
-    values_["precision"] = info.precision;
-    values_["recall"] = info.recall;
-    values_["F1-score"] = info.f1;
-    if (containMacroMicroInfo) {
-      values_["macro-average-precision"] = info.macroAvgPrecision;
-      values_["macro-average-recall"] = info.macroAvgRecall;
-      values_["macro-average-F1-score"] = info.macroAvgF1Score;
-      if (!isMultiBinaryLabel_) {
-        // precision and recall are equal in this case
-        values_["micro-average-precision"] = info.microAvgPrecision;
-      } else {
-        values_["micro-average-precision"] = info.microAvgPrecision;
-        values_["micro-average-recall"] = info.microAvgRecall;
-        values_["micro-average-F1-score"] = info.microAvgF1Score;
-      }
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::getNames(std::vector<std::string>* names) {
-  this->storeLocalValues();
-  names->reserve(this->values_.size());
-  for (auto it = this->values_.begin(); it != this->values_.end(); ++it) {
-    names->push_back(this->config_.name() + "." + it->first);
-  }
-}
-
-real PrecisionRecallEvaluator::getValue(const std::string& name,
-                                        Error* err) const {
-  this->storeLocalValues();
-  std::vector<std::string> buffers;
-  paddle::str::split(name, '.', &buffers);
-  auto it = this->values_.find(buffers[buffers.size() - 1]);
-  if (it == this->values_.end()) {  // not found
-    *err = Error("No such key %s", name.c_str());
-    return .0f;
-  }
-
-  return it->second;
-}
-
-std::string PrecisionRecallEvaluator::getType(const std::string& name,
-                                              Error* err) const {
-  this->getValue(name, err);
-  if (!err->isOK()) {
-    return "";
-  }
-  return "precision_recall";
-}
-
-void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
-  size_t size = 4 * statsInfo_.size();
-  double* buf = new double[size];
-  for (size_t i = 0; i < statsInfo_.size(); ++i) {
-    buf[4 * i + 0] = statsInfo_[i].TP;
-    buf[4 * i + 1] = statsInfo_[i].TN;
-    buf[4 * i + 2] = statsInfo_[i].FP;
-    buf[4 * i + 3] = statsInfo_[i].FN;
-  }
-  client->reduce(buf, buf, size, FLAGS_trainer_id, 0);
-  for (size_t i = 0; i < statsInfo_.size(); ++i) {
-    statsInfo_[i].TP = buf[4 * i + 0];
-    statsInfo_[i].TN = buf[4 * i + 1];
-    statsInfo_[i].FP = buf[4 * i + 2];
-    statsInfo_[i].FN = buf[4 * i + 3];
-  }
-  delete[] buf;
-}
-
-bool PrecisionRecallEvaluator::getStatsInfo(
-    PrecisionRecallEvaluator::PrintStatsInfo* info) const {
-  int label = config_.positive_label();
-  if (label != -1) {
-    CHECK(label >= 0 && label < (int)statsInfo_.size())
-        << "positive_label [" << label << "] should be in range [0, "
-        << statsInfo_.size() << ")";
-    info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
-    info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
-    info->f1 = calcF1Score(info->precision, info->recall);
-    return false;
-  }
-
-  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
-  // macro average method: precision = (precision1+precision2)/2
-  double microTotalTP = 0;
-  double microTotalFP = 0;
-  double microTotalFN = 0;
-  info->macroAvgPrecision = 0;
-  info->macroAvgRecall = 0;
-  size_t numLabels = statsInfo_.size();
-  for (size_t i = 0; i < numLabels; ++i) {
-    microTotalTP += statsInfo_[i].TP;
-    microTotalFP += statsInfo_[i].FP;
-    microTotalFN += statsInfo_[i].FN;
-    info->macroAvgPrecision +=
-        calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
-    info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
-  }
-  info->macroAvgPrecision /= numLabels;
-  info->macroAvgRecall /= numLabels;
-  info->macroAvgF1Score =
-      calcF1Score(info->macroAvgPrecision, info->macroAvgRecall);
-
-  info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
-  info->microAvgRecall = calcRecall(microTotalTP, microTotalFN);
-  info->microAvgF1Score =
-      calcF1Score(info->microAvgPrecision, info->microAvgRecall);
-  return true;
-}
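Editor's note: a minimal standalone sketch (independent of the deleted classes, with made-up counts) of the macro vs. micro averaging that getStatsInfo() performs -- macro averages the per-class precisions, micro pools the raw counts first:

```cpp
#include <cstdio>

int main() {
  double TP[2] = {8, 1}, FP[2] = {2, 1};  // two classes' confusion counts
  double macro = 0, tp = 0, fp = 0;
  for (int i = 0; i < 2; ++i) {
    macro += TP[i] / (TP[i] + FP[i]) / 2.0;  // per-class precision, averaged
    tp += TP[i];
    fp += FP[i];
  }
  printf("macro-average precision = %f\n", macro);        // (0.8 + 0.5) / 2
  printf("micro-average precision = %f\n", tp / (tp + fp));  // 9 / 12
  return 0;
}
```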
-REGISTER_EVALUATOR(pnpair, PnpairEvaluator);
-void PnpairEvaluator::start() {
-  Evaluator::start();
-  memset(pairArray_, 0, sizeof(pairArray_));
-  predictArray_.clear();
-}
-
-real PnpairEvaluator::evalImp(std::vector<Argument>& arguments) {
-  CHECK_GE(arguments.size(), 3UL);
-  CHECK_LE(arguments.size(), 4UL);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  IVectorPtr info = arguments[2].ids;
-  bool supportWeight = (4 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[3].value : nullptr;
-  if (nullptr == output || nullptr == label ||
-      (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-  size_t height = output->getHeight();
-  size_t width = output->getWidth();
-  CHECK_EQ(height, label->getSize());
-  CHECK_EQ(height, info->getSize());
-  if (supportWeight) {
-    CHECK_EQ(height, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    Matrix::resizeOrCreate(cpuOutput_, height, width, false, false);
-    IVector::resizeOrCreate(cpuLabel_, height, false);
-    IVector::resizeOrCreate(cpuInfo_, height, false);
-    cpuOutput_->copyFrom(*output);
-    cpuLabel_->copyFrom(*label);
-    cpuInfo_->copyFrom(*info);
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    info = cpuInfo_;
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-      weight = cpuWeight_;
-    }
-  }
-
-  real* outputs = output->getData();
-  int* labels = label->getData();
-  int* infos = info->getData();
-  real* weights = supportWeight ? weight->getData() : nullptr;
-  for (size_t i = 0; i < output->getHeight(); i++) {
-    real y1 = outputs[i * width + (width - 1)];
-    real w = supportWeight ? weights[i] : 1.0;
-    predictArray_.push_back(PredictionResult(y1, labels[i], infos[i], w));
-  }
-  return 0;
-}
-
-void PnpairEvaluator::stat(size_t start,
-                           size_t end,
-                           PredictionResult* answers,
-                           double& pos,
-                           double& neg,
-                           double& spe) {
-  for (size_t i = start; i < end; i++) {
-    for (size_t j = i + 1; j < end; j++) {
-      CHECK_EQ(answers[i].queryid, answers[j].queryid);
-      // The pair weight is the mean of the two samples' weight
-      double weight = (answers[i].weight + answers[j].weight) / 2.0;
-      if (answers[i].label != answers[j].label) {
-        if ((answers[i].out > answers[j].out &&
-             answers[i].label > answers[j].label) ||
-            (answers[i].out < answers[j].out &&
-             answers[i].label < answers[j].label)) {
-          pos += weight;
-        } else if ((answers[i].out > answers[j].out &&
-                    answers[i].label < answers[j].label) ||
-                   (answers[i].out < answers[j].out &&
-                    answers[i].label > answers[j].label)) {
-          neg += weight;
-        } else {
-          spe += weight;
-        }
-      }
-    }
-  }
-}
-
-void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
-  std::sort(predictArray.begin(),
-            predictArray.end(),
-            [](const PredictionResult& x, const PredictionResult& y) {
-              return x.queryid < y.queryid;
-            });
-
-  double pos = 0;
-  double neg = 0;
-  double special = 0;
-  auto start = predictArray.begin();
-  while (start != predictArray.end()) {
-    auto end = std::find_if(
-        start + 1, predictArray.end(), [=](const PredictionResult& x) {
-          return x.queryid != start->queryid;
-        });
-    CHECK(end != start);
-    stat(start - predictArray.begin(),
-         end - predictArray.begin(),
-         predictArray.data(),
-         pos,
-         neg,
-         special);
-
-    start = end;
-  }
-
-  pairArray_[0] += pos;
-  pairArray_[1] += neg;
-
-  LOG(INFO) << " calc total pos pair: " << pos
-            << " calc total neg pair: " << neg
-            << " calc total special pair: " << special;
-}
-
-std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; }
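Editor's note: a minimal standalone sketch (independent of the deleted classes) of the pair counting inside the deleted PnpairEvaluator::stat() -- within one query, every pair with different labels is concordant ("pos", scores ordered like labels), discordant ("neg"), or tied ("special"):

```cpp
#include <cstdio>

int main() {
  struct Pred { double out; int label; };
  Pred a[3] = {{0.9, 1}, {0.4, 0}, {0.4, 1}};  // one query's predictions
  double pos = 0, neg = 0, spe = 0;
  for (int i = 0; i < 3; ++i)
    for (int j = i + 1; j < 3; ++j) {
      if (a[i].label == a[j].label) continue;  // only mixed-label pairs count
      if ((a[i].out > a[j].out && a[i].label > a[j].label) ||
          (a[i].out < a[j].out && a[i].label < a[j].label))
        pos += 1;
      else if ((a[i].out > a[j].out && a[i].label < a[j].label) ||
               (a[i].out < a[j].out && a[i].label > a[j].label))
        neg += 1;
      else
        spe += 1;  // equal scores, different labels
    }
  printf("pos=%.0f neg=%.0f special=%.0f\n", pos, neg, spe);
  return 0;
}
```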
-ClassRegistrar<Evaluator> Evaluator::registrar_;
-Evaluator* Evaluator::create(const EvaluatorConfig& config) {
-  Evaluator* evaluator = registrar_.createByType(config.type());
-  evaluator->init(config);
-  return evaluator;
-}
-
-REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator);
-REGISTER_EVALUATOR(sum, SumEvaluator);
-static InitFunction __reg_type_auc_sum__([]() {
-  Evaluator::registrar_.registerClass(
-      "last-column-sum", [] { return new ColumnSumEvaluator(-1); });
-  Evaluator::registrar_.registerClass("last-column-auc",
-                                      [] { return new AucEvaluator(-1); });
-});
-
-/**
- * @brief print value of each layer.
- *
- * The config file api is value_printer_evaluator.
- */
-class ValuePrinter : public NotGetableEvaluator {
- public:
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
-                                                      "layer=" + name + " ");
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(value_printer, ValuePrinter);
-
-/**
- * @brief print gradient of each layer.
- *
- * The config file api is gradient_printer_evaluator.
- */
-class GradientPrinter : public NotGetableEvaluator {
- public:
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.grad) {
-        std::ostringstream os;
-        argu.grad->print(os);
-        LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
-      }
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
-/**
- * @brief print row max id vector of each layer
- *
- * The config file api is maxid_printer_evaluator.
- */
-class MaxIdPrinter : public NotGetableEvaluator {
- private:
-  IVectorPtr maxIds_;
-  MatrixPtr maxValues_;
-
- public:
-  MaxIdPrinter() {}
-
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.value) {
-        size_t height = argu.value->getHeight();
-        size_t width = config_.num_results();
-        IVector::resizeOrCreate(maxIds_, height * width, false);
-        Matrix::resizeOrCreate(maxValues_, height, width, false);
-        argu.value->rowMax(*maxIds_, *maxValues_);
-        std::ostringstream os;
-        int* ids = maxIds_->getData();
-        real* values = maxValues_->getData();
-        for (size_t i = 0; i < height; ++i) {
-          for (size_t j = 0; j < width; ++j) {
-            size_t pos = i * width + j;
-            os << ids[pos] << " : " << values[pos] << ", ";
-          }
-          os << std::endl;
-        }
-        LOG(INFO) << "layer=" << name << " row max id vector:\n" << os.str();
-      }
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
-/**
- * @brief print sequence max frames of each layer
- *
- * The config file api is maxframe_printer_evaluator.
- */
-class MaxFramePrinter : public NotGetableEvaluator {
- private:
-  IVectorPtr maxIds_;
-  MatrixPtr maxValues_;
-  MatrixPtr value_;
-
- public:
-  MaxFramePrinter() {
-    value_ =
-        Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, false);
-  }
-
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-
-      CHECK_EQ(argu.value->getWidth(), 1LU);
-      size_t numSequences = argu.getNumSequences();
-      const int* starts = argu.sequenceStartPositions->getData(false);
-
-      std::ostringstream os;
-      for (size_t i = 0; i < numSequences; ++i) {
-        size_t offset = starts[i];
-        size_t size = starts[i + 1] - starts[i];
-        value_->setData(argu.value->getData() + offset, 1LU, size);
-
-        size_t height = 1LU;
-        size_t width = std::min((size_t)config_.num_results(), size);
-        IVector::resizeOrCreate(maxIds_, height * width, false);
-        Matrix::resizeOrCreate(maxValues_, height, width, false);
-
-        value_->rowMax(*maxIds_, *maxValues_);
-
-        int* ids = maxIds_->getData();
-        real* values = maxValues_->getData();
-        for (size_t j = 0; j < width; ++j) {
-          os << ids[j] << " : " << values[j] << ", ";
-        }
-        os << "total " << size << " frames" << std::endl;
-      }
-      LOG(INFO) << "layer=" << name << " sequence max frames:\n" << os.str();
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
-
-/**
- * @brief print text according to index matrix and a dictionary.
- *
- * There can be multiple inputs to this layer:
- * - If there is only one input, the input must be a matrix containing
- *   the sequence of indices;
- * - If there is more than one input, the first input should be ids,
- *   and is interpreted as sample ids.
- *
- * The output format will be:
- *
- * - sequence without sub-sequence, with probability:
- *
- * @code
- *  id \t prob space_separated_tokens_from_dictionary_according_to_seq
- * @endcode
- *
- * - sequence without sub-sequence, without probability:
- *
- * @code
- *  id \t space_separated_tokens_from_dictionary_according_to_seq
- * @endcode
- *
- * - sequence with sub-sequence, without probability:
- *
- * @code
- *  id \t space_separated_tokens_from_dictionary_according_to_sub_seq
- *  \t \t space_separated_tokens_from_dictionary_according_to_sub_seq
- *  ...
- * @endcode
- *
- * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
- * with maxid (when generating) as an input.
- *
- * The config file api is seqtext_printer_evaluator.
- *
- */
-class SequenceTextPrinter : public NotGetableEvaluator {
- private:
-  /// dict_file, which contains a list of tokens
-  std::vector<std::string> dict_;
-  /// result_file, which is the output file
-  std::ofstream os_;
-  /// True/False, to indicate whether to use space to separate output tokens.
-  /// Default is True. No space is added if set to False.
-  bool delimited_;
-  /// store the cpu version of argument.ids
-  std::vector<IVectorPtr> cpuIds_;
-  /// store the probability associated with each sequence
-  std::vector<MatrixPtr> cpuIn_;
-
- public:
-  SequenceTextPrinter() {}
-
-  virtual void init(const EvaluatorConfig& config) {
-    Evaluator::init(config);
-    if (!config.dict_file().empty()) {
-      loadFileList(config.dict_file(), dict_);
-    }
-
-    os_.open(config.result_file(), std::ofstream::trunc);
-    CHECK(os_.is_open()) << "Failed to open file " << config.result_file();
-    delimited_ = config.delimited();
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_GE(arguments.size(), 1LU);
-    bool hasId = arguments.size() > 1;
-    size_t numSequences = arguments[0].getNumSequences();
-    if (hasId) {
-      CHECK_EQ(arguments[0].ids->getSize(), numSequences)
-          << "first input must be sample id.";
-    }
-    for (size_t i = hasId ? 1 : 0; i < arguments.size(); ++i) {
-      CHECK_EQ((size_t)arguments[i].getNumSequences(), numSequences);
-    }
-
-    auto resizeVector = [](IVectorPtr& dest, const IVectorPtr& src) {
-      if (src && src->useGpu()) {
-        IVector::resizeOrCreate(dest, src->getSize(), false);
-        dest->copyFrom(*src);
-      } else {
-        dest = src;
-      }
-    };
-
-    auto resizeMatrix = [](MatrixPtr& dest, const MatrixPtr& src) {
-      if (src && src->useGpu()) {
-        Matrix::resizeOrCreate(
-            dest, src->getHeight(), src->getWidth(), false, false);
-        dest->copyFrom(*src);
-      } else {
-        dest = src;
-      }
-    };
-
-    cpuIds_.resize(arguments.size());
-    cpuIn_.resize(arguments.size());
-    for (size_t i = 0; i < arguments.size(); ++i) {
-      resizeVector(cpuIds_[i], arguments[i].ids);
-      resizeMatrix(cpuIn_[i], arguments[i].in);
-    }
-
-    int* sampleIds = nullptr;
-    if (hasId) {
-      sampleIds = cpuIds_[0]->getData();
-    }
-
-    for (size_t i = 0; i < numSequences; ++i) {
-      os_ << (hasId ? sampleIds[i] : i);
-      for (size_t j = hasId ? 1 : 0; j < arguments.size(); ++j) {
-        int* output = cpuIds_[j]->getData();
-        const int* starts = arguments[j].sequenceStartPositions->getData(false);
-
-        auto seqPrint = [&](int start, int end) {
-          os_ << "\t";
-          for (int k = start; k < end; k++) {
-            int id = output[k];
-            os_ << (delimited_ ? " " : "");
-            if (!dict_.empty()) {
-              CHECK_LT((size_t)id, dict_.size());
-              os_ << dict_[id];
-            } else {
-              os_ << id;
-            }
-          }
-        };
-
-        if (arguments[j].hasSubseq()) {
-          // print sequence with sub-sequence
-          const int* subStarts =
-              arguments[j].subSequenceStartPositions->getData(false);
-          int subSeqId_start = 0;
-          int subSeqId_end = 0;
-          for (size_t k = 0; k < (size_t)arguments[j].getNumSubSequences() + 1;
-               ++k) {
-            if (starts[i] == subStarts[k]) subSeqId_start = k;
-            if (starts[i + 1] == subStarts[k]) subSeqId_end = k;
-          }
-          for (int k = subSeqId_start; k < subSeqId_end; k++) {
-            seqPrint(subStarts[k], subStarts[k + 1]);
-            os_ << std::endl;
-          }
-
-        } else {
-          // print sequence without sub-sequence
-          if (arguments[j].in) {  // beam print
-            real* probs = cpuIn_[j]->rowBuf(i);
-            os_ << std::endl;
-            int start = starts[i];
-            int seqEnd = starts[i + 1];
-            for (size_t k = 0; k < arguments[j].in->getWidth(); ++k) {
-              if (start == seqEnd) {
-                break;
-              }
-              int end = start + output[start] + 2;
-              CHECK_LE(end, seqEnd);
-              CHECK_EQ(output[end - 1], -1);
-              os_ << k << "\t" << probs[k];
-              seqPrint(start + 1, end - 1);
-              os_ << std::endl;
-              start = end;
-            }
-          } else {
-            seqPrint(starts[i], starts[i + 1]);
-          }
-        }
-      }
-      os_ << std::endl;
-    }
-    return 0;
-  }
-};
-REGISTER_EVALUATOR(seq_text_printer, SequenceTextPrinter);
-/**
- * @brief print classification error.
- *
- * The config file api is classification_error_printer_evaluator.
- */
-class ClassificationErrorPrinter : public ClassificationErrorEvaluator {
- public:
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    MatrixPtr errorMat = calcError(arguments);
-
-    std::ostringstream os;
-    errorMat->print(os);
-    LOG(INFO) << "Printer=" << config_.name() << " Classification Error:\n"
-              << os.str();
-
-    if (auto startPos = arguments[0].sequenceStartPositions) {
-      std::ostringstream os;
-      startPos->getVector(false)->print(os, startPos->getSize());
-      LOG(INFO) << "Printer=" << config_.name() << " sequence pos vector:\n"
-                << os.str();
-    }
-    return 0;
-  }
-};
-REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter);
-
-std::string DummyEvaluator::getTypeImpl() const { return "dummy"; }
-
-}  // namespace paddle
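Editor's note: a minimal standalone sketch (independent of the deleted classes; the registry here is a simplified stand-in, not the Paddle ClassRegistrar API) of the factory pattern behind REGISTER_EVALUATOR and Evaluator::create -- each evaluator registers a constructor under a type name, and create() looks it up by the config's type string:

```cpp
#include <cstdio>
#include <functional>
#include <map>
#include <string>

struct Eval {
  virtual const char* type() const = 0;
  virtual ~Eval() {}
};
struct SumEval : Eval {
  const char* type() const override { return "sum"; }
};

// Simplified registry: type name -> constructor.
std::map<std::string, std::function<Eval*()>> g_registry;

int main() {
  g_registry["sum"] = [] { return new SumEval(); };  // ~ REGISTER_EVALUATOR(sum, ...)
  Eval* e = g_registry.at("sum")();                  // ~ Evaluator::create(config)
  printf("created evaluator of type: %s\n", e->type());
  delete e;
  return 0;
}
```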
*/ - -#pragma once - -#include -#include "ModelConfig.pb.h" -#include "paddle/parameter/Argument.h" -#include "paddle/pserver/ParameterClient2.h" -#include "paddle/utils/ClassRegistrar.h" -#include "paddle/utils/Error.h" - -namespace paddle { - -class NeuralNetwork; -/** - * @def REGISTER_EVALUATOR - * @brief Macro for registering evaluator class - */ - -#define REGISTER_EVALUATOR(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - Evaluator::registrar_.registerClass<__class_name>(#__type_name); \ - }) -/** - * @brief Base class for Evaluator - * Evaluating the performance of a model is very important. - * It indicates how successful the scores(predictions) of a datasets - * has been by a trained model. - */ -class Evaluator { - public: - static Evaluator* create(const EvaluatorConfig& config); - - Evaluator() : numSamples_(0), totalScore_(0) {} - - virtual ~Evaluator() {} - - virtual void init(const EvaluatorConfig& config) { config_ = config; } - - /** - * @brief start to evaluate some data - */ - virtual void start() { - numSamples_ = 0; - totalScore_ = 0; - } - - /** - * @brief Process a batch of data. - */ - virtual void eval(const NeuralNetwork& nn); - - /** - * @brief Process a batch of data. - * @return the score for the batch if it make sense to sum the score across - * batches. - * @note Otherwise evaluator should return 0 and override finish() and - * printStats() to do the right calculation. - */ - virtual real evalImp(std::vector& arguments) = 0; - - /** - * @brief Update the number of processed samples - */ - virtual void updateSamplesNum(const std::vector& arguments) { - numSamples_ += arguments[0].getBatchSize(); - } - - /// finish() should be called before distributeEval - virtual void distributeEval(ParameterClient2* client) { - LOG(FATAL) << "Not implemeted"; - } - - void mergeResultsOfAllClients(ParameterClient2* client) { - double data[2] = {totalScore_, numSamples_}; - client->reduce(data, data, 2, FLAGS_trainer_id, 0); - totalScore_ = data[0]; - numSamples_ = data[1]; - } - - /** - * @brief finish the evaluation. - */ - virtual void finish() {} - - /** - * @brief print the statistics of evaluate result - * @note finish() should be called before printStats - */ - virtual void printStats(std::ostream& os) const { - os << config_.name() << "=" - << (numSamples_ ? totalScore_ / numSamples_ : 0); - } - - friend std::ostream& operator<<(std::ostream& os, - const Evaluator& evaluator) { - evaluator.printStats(os); - return os; - } - - friend std::ostream&& operator<<(std::ostream&& os, // NOLINT - const Evaluator& evaluator) { - evaluator.printStats(os); - return std::move(os); - } - - static ClassRegistrar registrar_; - - /** - * @brief getNames will return all field names of current evaluator. - * - * The format of name is `evaluator_name.evaluator_fields`. If the evaluator - * has multiple field, the name could be `evaluator_name.field1`. For example - * the PrecisionRecallEvaluator contains `precision`, `recall` fields. The get - * names will return `precision_recall_evaluator.precision`, - * `precision_recall_evaluator.recal`, etc. - * - * Also, if current Evaluator is a combined evaluator. getNames will return - * all names of all evaluators inside the combined evaluator. - * - * @param names [out]: the field names of current evaluator. - * @note Never clear the names parameter inside getNames. 
- */ - virtual void getNames(std::vector* names) { - names->push_back(config_.name()); - } - - /** - * @brief getValue will return the current evaluate value of one field. - * - * @param name: The field name of current evaluator. - * @param err [out]: The error state. - * - * @return The evaluate value(metric). - */ - virtual real getValue(const std::string& name, Error* err) const { - if (name != config_.name()) { - *err = Error("no such name of evaluator %s", name.c_str()); - return .0f; - } - return this->getValueImpl(); - } - - /** - * @brief getType will return the evaluator type by field name. - * - * Evaluate Type is the current type of evaluator in string. Such as 'auc', - * 'precision_recall'. In combined evaluator, different name may get different - * evaluate type because it could be evaluated by different evaluator inside. - * - * @param name: The field name of current Evaluator. - * @param err: The error state. nullptr means don't care. - * @return the evaluator type string. - */ - virtual std::string getType(const std::string& name, Error* err) const { - if (name != config_.name()) { - *err = Error("no such name of evaluator %s", name.c_str()); - return std::string(); - } - return this->getTypeImpl(); - } - - protected: - /** - * @brief getValueImpl The simplest way to define getValue result. If this - * evaluator doesn't contain multiple fields, and do not throw any error, just - * implemented this method to get the evaluate result(metric). - * @return Evaluate result(metric). - */ - virtual real getValueImpl() const { - return numSamples_ != .0 ? totalScore_ / numSamples_ : .0; - } - - /** - * @brief getTypeImpl The simplest way to define getType result. If this - * evaluator doesn't combine many evaluators, the get type should only return - * itself type. - * @return Evaluator type. - */ - virtual std::string getTypeImpl() const { return "base"; } - - protected: - EvaluatorConfig config_; - double numSamples_; - double totalScore_; -}; - -/** - * @brief The NotGetableEvaluator class is the base class of evaluator that - * cannot get value in runtime. The most NotGetableEvaluator is Printer - * Evaluator, which is only used to debug network configuration. - */ -class NotGetableEvaluator : public Evaluator { - // Evaluator interface - public: - void getNames(std::vector* names) {} - - real getValue(const std::string& name, Error* err) const { - *err = Error("Not implemented"); - return .0f; - } - - std::string getType(const std::string& name, Error* err) const { - *err = Error("Not implemented"); - return ""; - } -}; - -class DummyEvaluator : public Evaluator { - public: - DummyEvaluator() {} - virtual void init(const EvaluatorConfig&) {} - virtual void start() {} - virtual void eval(const NeuralNetwork&) {} - virtual real evalImp(std::vector& arguments) { - (void)arguments; - return -1; - } - virtual void finish() {} - virtual void printStats(std::ostream&) const {} - - // Evaluator interface - protected: - std::string getTypeImpl() const; -}; -/** - * @brief evaluate AUC using colIdx-th column as prediction. - * The AUC(Area Under the Curve) is a common evaluation metric - * for binary classification problems. It computes the area under - * the receiver operating characteristic(ROC) curve. - * - * @note colIdx-th column - * - * - colIdx = 0: the 0-th column. - * - colIdx > 0: the colIdx-th column. - * - colIdx < 0: the last colIdx-th column. - * - * The config file api is auc_evaluator. 
- * - */ -class AucEvaluator : public Evaluator { - public: - AucEvaluator(int32_t colIdx) - : colIdx_(colIdx), - realColumnIdx_(0), - cpuOutput_(nullptr), - cpuLabel_(nullptr), - cpuWeight_(nullptr) {} - - virtual void start(); - - virtual real evalImp(std::vector<Argument>& arguments); - - virtual void printStats(std::ostream& os) const { - os << config_.name() << "=" << calcAuc(); - } - - virtual void distributeEval(ParameterClient2* client); - - private: - static const uint32_t kBinNum_ = (1 << 24) - 1; - static const int kNegativeLabel_ = 0; - double statPos_[kBinNum_ + 1]; - double statNeg_[kBinNum_ + 1]; - int32_t colIdx_; - uint32_t realColumnIdx_; - MatrixPtr cpuOutput_; - IVectorPtr cpuLabel_; - MatrixPtr cpuWeight_; - - AucEvaluator() {} - - inline static double trapezoidArea(double X1, - double X2, - double Y1, - double Y2) { - return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - double calcAuc() const; - - // Evaluator interface - protected: - real getValueImpl() const; - std::string getTypeImpl() const; -}; - -/** - * @brief RankAucEvaluator calculates the AUC of each list (i.e., titles - * under the same query), and averages them. Each list should be organized - * as a sequence. The inputs of this evaluator are [output, click, pv]. If pv - * is not provided, it will be set to 1. Click and pv are - * dense values. - */ -class RankAucEvaluator : public Evaluator { - public: - // evaluate ranking AUC - virtual void start(); - - virtual void updateSamplesNum(const std::vector<Argument>& arguments); - - virtual real evalImp(std::vector<Argument>& arguments); - - virtual void distributeEval(ParameterClient2* client) { - mergeResultsOfAllClients(client); - } - - private: - MatrixPtr output_; - MatrixPtr click_; - MatrixPtr pv_; - std::vector<std::pair<real, int>> outputPair_; - - double calcRankAuc(real* outputData, - real* clickData, - real* pvData, - size_t size); - - // Evaluator interface - protected: - std::string getTypeImpl() const; -}; - -/** - * @brief precision, recall and f1 score Evaluator - * \f[ - * precision = \frac{tp}{tp+fp} \\ - * recall=\frac{tp}{tp+fn} \\ - * f1=2*\frac{precision*recall}{precision+recall} - * \f] - * - * The config file api is precision_recall_evaluator.
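A quick worked example of the three formulas (note the precision denominator is tp+fp, matching the calcPrecision() helper further down): with TP=8, FP=2, FN=4, precision = 8/10 = 0.8, recall = 8/12 ≈ 0.667, and F1 ≈ 0.727. The same arithmetic as a standalone sketch, including the 1.0/0.0 fallbacks the class uses for empty denominators:

#include <cstdio>

static double precisionOf(double tp, double fp) {
  return (tp > 0.0 || fp > 0.0) ? tp / (tp + fp) : 1.0;
}
static double recallOf(double tp, double fn) {
  return (tp > 0.0 || fn > 0.0) ? tp / (tp + fn) : 1.0;
}
static double f1Of(double p, double r) {
  return (p > 0.0 || r > 0.0) ? 2.0 * p * r / (p + r) : 0.0;
}

int main() {
  double p = precisionOf(8, 2);  // 0.800
  double r = recallOf(8, 4);     // 0.667
  printf("precision=%.3f recall=%.3f f1=%.3f\n", p, r, f1Of(p, r));
  return 0;
}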
- */ -class PrecisionRecallEvaluator : public Evaluator { - public: - // Evaluate precision, recall and F1 score - PrecisionRecallEvaluator() - : isMultiBinaryLabel_(false), - cpuOutput_(nullptr), - cpuLabel_(nullptr), - cpuWeight_(nullptr) {} - - virtual void start(); - - virtual real evalImp(std::vector& arguments); - - virtual void printStats(std::ostream& os) const; - - virtual void distributeEval(ParameterClient2* client); - - void getNames(std::vector* names); - - real getValue(const std::string& name, Error* err) const; - - std::string getType(const std::string& name, Error* err) const; - - struct StatsInfo { - /// numbers of true positives - double TP; - /// numbers of true negatives - double TN; - /// numbers of false positives - double FP; - /// numbers of false negatives - double FN; - - StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {} - }; - - private: - bool isMultiBinaryLabel_; - std::vector statsInfo_; - - MatrixPtr cpuOutput_; - IVectorPtr cpuLabel_; - MatrixPtr cpuWeight_; - - struct PrintStatsInfo { - double precision; - double recall; - double f1; - double macroAvgPrecision; - double macroAvgRecall; - double macroAvgF1Score; - double microAvgPrecision; - double microAvgRecall; - double microAvgF1Score; - }; - - bool getStatsInfo(PrintStatsInfo* info) const; - - void calcStatsInfo(const MatrixPtr& output, - const IVectorPtr& label, - const MatrixPtr& weight); - - void calcStatsInfoMulti(const MatrixPtr& output, - const MatrixPtr& label, - const MatrixPtr& weight); - - inline static double calcPrecision(double TP, double FP) { - if (TP > 0.0 || FP > 0.0) { - return TP / (TP + FP); - } else { - return 1.0; - } - } - - inline static double calcRecall(double TP, double FN) { - if (TP > 0.0 || FN > 0.0) { - return TP / (TP + FN); - } else { - return 1.0; - } - } - - inline static double calcF1Score(double precision, double recall) { - if (precision > 0.0 || recall > 0.0) { - return 2 * precision * recall / (precision + recall); - } else { - return 0; - } - } - - mutable std::unordered_map values_; - - void storeLocalValues() const; -}; - -/* - * @brief positive-negative pair rate Evaluator - * - * The config file api is pnpair_evaluator. 
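The PnpairEvaluator that follows accumulates, per query id, how many item pairs with different labels the model orders correctly (positive pairs) versus incorrectly (negative pairs), and reports their ratio. A standalone sketch of that pair counting (assumed semantics; score ties are skipped and per-item weights are omitted):

#include <cstddef>

struct Prediction { float out; int label; };

// Count correctly ordered (pos) and inverted (neg) pairs among the items of
// one query group; pairs with equal labels express no preference.
void countPairs(const Prediction* items, size_t n, double& pos, double& neg) {
  for (size_t i = 0; i < n; ++i) {
    for (size_t j = i + 1; j < n; ++j) {
      if (items[i].label == items[j].label) continue;
      if (items[i].out == items[j].out) continue;  // tie in score: skip
      bool labelHigher = items[i].label > items[j].label;
      bool scoreHigher = items[i].out > items[j].out;
      if (labelHigher == scoreHigher) { ++pos; } else { ++neg; }
    }
  }
}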
- */ -class PnpairEvaluator : public Evaluator { - public: - PnpairEvaluator() - : cpuOutput_(nullptr), - cpuLabel_(nullptr), - cpuInfo_(nullptr), - cpuWeight_(nullptr) {} - - virtual void start(); - virtual real evalImp(std::vector& arguments); - - struct PredictionResult { - PredictionResult(real __out, int __label, int __queryid, real __weight) - : out(__out), label(__label), queryid(__queryid), weight(__weight) {} - real out; - int label; - int queryid; - real weight; - }; - std::vector predictArray_; - void printPredictResults() { - std::ofstream fs(FLAGS_predict_file); - CHECK(fs) << "Fail to open " << FLAGS_predict_file; - for (auto& res : predictArray_) { - fs << res.out << " " << res.label << " " << res.queryid << std::endl; - } - } - - void stat(size_t start, - size_t end, - PredictionResult* answers, - double& pos, - double& neg, - double& spe); - void calc(std::vector& predictArray); - - virtual void finish() { calc(predictArray_); } - - virtual void printStats(std::ostream& os) const { - os << " pos/neg=" << this->getValueImpl(); - } - - virtual void distributeEval(ParameterClient2* client) { - client->reduce(pairArray_, pairArray_, kPairArrayNum_, FLAGS_trainer_id, 0); - LOG(INFO) << " distribute eval calc total pos pair: " << pairArray_[0] - << " calc total neg pair: " << pairArray_[1]; - } - - private: - static const uint32_t kPairArrayNum_ = 2; - double pairArray_[kPairArrayNum_]; - MatrixPtr cpuOutput_; - IVectorPtr cpuLabel_; - IVectorPtr cpuInfo_; - MatrixPtr cpuWeight_; - - // Evaluator interface - protected: - real getValueImpl() const { - return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]); - } - std::string getTypeImpl() const; -}; - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h deleted file mode 100644 index 22cf5d265f429ecbcea1808a54c85d7e89f8bc99..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/GradientMachine.h +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "ModelConfig.pb.h" -#include "TrainerConfig.pb.h" -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/gserver/layers/Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/ParameterUpdaterBase.h" -#include "paddle/utils/Thread.h" - -#ifndef PADDLE_MOBILE_INFERENCE -#include "paddle/gserver/evaluators/Evaluator.h" -#endif - -namespace paddle { -/** - * @brief A gradient machine is capable of calculating some outputs given - * some inputs and performing gradient calculation based on the - * derivative from the outputs. - * - * A gradient machine can be either a full neural network or part of a neural - * network. - * - * Usage for training: - * - * 1. Prepare inArgs. Put your input data into inArgs[i].value. - * - * 2. Call forward(inArgs, &outArgs) - * - * 3. 
Calculate gradient with respect to outArgs[i]->value - * and fill it into outArgs[i]->grad. - * This step can be skipped if the outputs are from cost layers. - * - * 4. Call backward(). After backward, the gradient of each parameter is - * accumulated to getParameters()[i]->getBuf(PARAMETER_GRADIENT) - * - * 5. Update parameter value getParameters()[i]->getBuf(PARAMETER_VALUE) using - * gradients. - * - * 6. Clear gradients to zero. - * - * Usage for prediction: - * - * 1. Prepare inArgs. Put your input data into inArgs[i].value. - * - * 2. Call forward(inArgs, &outArgs) - * - * 3. Obtain the prediction result from outArgs[i] - */ - -typedef std::vector<LayerStatePtr> MachineState; - -class GradientMachine; - -typedef std::shared_ptr<GradientMachine> GradientMachinePtr; - -class GradientMachine { - public: - enum CreateMode { - kNormal = 0, - kSgdSparseCpuTraining = 3, - kTesting = 4, - kCustom = 10 - }; - - /** - * Create a gradient machine from ModelConfig - * Parameter will have parameterTypes - */ - static GradientMachine* create( - const ModelConfig& config, - int mode = kNormal, - const std::vector<ParameterType>& parameterTypes = - std::vector<ParameterType>{ - PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}); - - virtual ~GradientMachine() {} - - /** - * Prefetch row ids of sparse parameter. - */ - virtual void prefetch(const std::vector<Argument>& inArgs) { (void)inArgs; } - - /** - * @brief Forward propagation. - * - * Calculate outputs (outArgs) based on the inputs (inArgs) - * - * @note: if passType==PASS_TEST, then backward() should not be called - */ - virtual void forward(const std::vector<Argument>& inArgs, - std::vector<Argument>* outArgs, - PassType passType) = 0; - - /** - * @brief Backward propagation. - * - * Calculate the gradient of inArgs and parameters. - * - * This function should only be called after a corresponding forward() call. - * The caller is responsible for filling the correct grad for the outArgs - * obtained using forward(). - * - * It may also change the grad field for the inArgs supplied at forward() - */ - virtual void backward(const UpdateCallback& callback = nullptr) = 0; - - /** - * Combine forward() and backward(). For multithread training, this - * may be faster. - * - * @note: passType PASS_TEST is not allowed for forwardBackward().
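Spelling the numbered training recipe out as code, a hypothetical helper (not part of this header) that runs one SGD step; it assumes the outputs come from cost layers (so step 3 is skipped) and that Vector::add(b, scale) and zeroMem() behave as their names suggest:

void trainOneBatch(GradientMachine& gm,
                   std::vector<Argument>& inArgs,
                   real learningRate) {
  std::vector<Argument> outArgs;
  gm.forward(inArgs, &outArgs, PASS_TRAIN);  // step 2
  gm.backward();                             // step 4: grads accumulate
  for (auto& para : gm.getParameters()) {    // steps 5 and 6
    auto value = para->getBuf(PARAMETER_VALUE);
    auto grad = para->getBuf(PARAMETER_GRADIENT);
    value->add(*grad, -learningRate);        // value -= lr * grad
    grad->zeroMem();                         // clear for the next batch
  }
}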
- */ - virtual void forwardBackward(const std::vector<Argument>& inArgs, - std::vector<Argument>* outArgs, - PassType passType, - const UpdateCallback& callback = nullptr) { - forward(inArgs, outArgs, passType); - backward(callback); - } - - virtual Argument getLayerOutput(const std::string& layerName) = 0; - - // see comment in Layer.h for the function with the same name - virtual void resetState() {} - - // set machine state - virtual void setState(const MachineState& machineState) {} - - // save machine state - virtual void getState(MachineState& machineState) {} - - virtual void onPassEnd() = 0; - -#ifndef PADDLE_MOBILE_INFERENCE - /** - * Create an evaluator which can be used for eval() - */ - virtual Evaluator* makeEvaluator() const = 0; - - /** - * evaluate using the given evaluator - */ - virtual void eval(Evaluator* evaluator) const = 0; -#endif - - std::vector<ParameterPtr>& getParameters() { return parameters_; } - - std::vector<ParameterPtr>& getNonStaticParameters() { - if (nonStaticParameters_.empty()) { - for (auto para : parameters_) { - if (!para->isStatic()) { - nonStaticParameters_.push_back(para); - } - } - } - return nonStaticParameters_; - } - - inline bool hasStaticParameters() { - return parameters_.size() != getNonStaticParameters().size(); - } - - /** - * @brief Used before formal training; starts worker threads and sets - * trainer parameters. - * - * @note This function will only be implemented and used in a - * multithreaded environment. - */ - virtual void start() {} - - /** - * @brief check whether each worker thread has failed, hit an error, or - * finished; if not, return true, otherwise return false. - * - * @note This function will only be implemented and used in a - * multithreaded environment. - */ - virtual void finish() {} - - /** - * @brief set the training status to a "finished" value; the sub worker - * threads will observe the change and then exit. - * - * @note This function will only be implemented and used in a - * multithreaded environment. - */ - virtual bool trainIsOn() { return true; } - - /** - * @brief when all or some of the sub worker threads are suspended waiting - * for the controller's instructions, the controller, after some processing - * is done, calls this function to wake up all the pending - * threads. - * - * @note This function will only be implemented and used in a - * multithreaded environment. - */ - virtual void restart() {} - - /// Set the gradient of the output from outside. - virtual void setOutputGrad(const std::vector<Argument>& args) { - LOG(FATAL) << "Not implemented!"; - } - - void saveParameters(const std::string& dir) const; - - void loadParameters(const std::string& dir); - - void randParameters(); - - virtual void getStats(real& cost, int64_t& numProcessed) { - (void)cost; - (void)numProcessed; - } - - /** - * @brief Release the middle layers' output memory. - * - * @note This function is used for memory optimization in inference. - */ - virtual void releaseOutput() {} - - protected: - virtual void onLoadParameter() {} - - std::vector<ParameterPtr> parameters_; - std::vector<ParameterPtr> nonStaticParameters_; -}; - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp deleted file mode 100644 index ac60a3a3408d37b66cb712d893c6b93a1750f448..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ /dev/null @@ -1,548 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/Util.h" - -#include "NeuralNetwork.h" -#include "hl_gpu.h" -#include "paddle/utils/CustomStackTrace.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/gserver/layers/MKLDNNLayer.h" -#endif - -#ifndef PADDLE_MOBILE_INFERENCE -#include "MultiNetwork.h" -#include "RecurrentGradientMachine.h" -#include "paddle/gserver/layers/AgentLayer.h" -#endif - -namespace paddle { -void parameterInitNN(int paramId, - Parameter* para, - std::vector* sharedParams) { - // Create parameters values. - if (!para->useGpu() && sharedParams) { - para->enableSharedType(PARAMETER_VALUE, - (*sharedParams)[paramId]->getBuf(PARAMETER_VALUE), - (*sharedParams)[paramId]->getMat(PARAMETER_VALUE)); - } else { - if (para->isSparseRemoteUpdate()) { - para->enableType(PARAMETER_VALUE, - FLAGS_loadsave_parameters_in_pserver - ? Parameter::MAT_SPARSE_ROW_PREFETCH - : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); - } else { - para->enableType(PARAMETER_VALUE); - } - } - // Create parameter gradients. - if (para->isSparseRemoteUpdate() && !sharedParams) { - para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW); - } else if (para->isGradSparseUpdate()) { - para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_AUTO_GROW); - } else if (!para->isStatic()) { - para->enableType(PARAMETER_GRADIENT); - } -} - -NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) { -#ifndef PADDLE_MOBILE_INFERENCE - if (config.type() == "recurrent_nn") { - return newNeuralNetwork("root"); - } else if (config.type() == "multi_nn") { - return new MultiNetwork("root"); - } else { - return newNeuralNetwork(); - } -#else - return new NeuralNetwork(); -#endif -} - -std::map NeuralNetwork::dllInitMap; - -void NeuralNetwork::init(const ModelConfig& config, - ParamInitCallback callback, - const std::vector& parameterTypes, - bool useGpu) { - using std::placeholders::_1; - using std::placeholders::_2; - ParamInitCallback paramCallback = nullptr; - if (callback != nullptr) { - paramSelfInited_ = false; - paramCallback = callback; - } else { - paramSelfInited_ = true; - paramCallback = std::bind(parameterInitNN, _1, _2, nullptr); - } - config_ = config; - - if (rootNetwork_ != nullptr) { - // direct use parameters_ and parameterMap_ from base network - CHECK_EQ((size_t)config.parameters_size(), - rootNetwork_->getParameters().size()); - parameters_ = rootNetwork_->getParameters(); - parameterMap_ = *(rootNetwork_->getParameterMap()); - } else { - parameters_.reserve(config.parameters_size()); - for (const auto& para_config : config.parameters()) { - auto parameter = std::make_shared(para_config, - useGpu, - /*initialize=*/false); - paramCallback(parameters_.size(), parameter.get()); - if (!callback) { - for (ParameterType type : - (parameter->isStatic() - ? 
std::vector{PARAMETER_VALUE} - : parameterTypes)) { - if (type != PARAMETER_VALUE && type != PARAMETER_GRADIENT) { - parameter->enableType(type); - } - } - } - parameter->setID(parameters_.size()); - parameters_.push_back(parameter); - CHECK(!parameterMap_.count(parameter->getName())); - parameterMap_[parameter->getName()] = parameter; - } - } - - auto layerCreate = [&](const LayerConfig& layer_config) { - auto layer = Layer::create(layer_config); - CHECK(layer) << "Create layer failed. Layer name:" << layer->getName(); - layers_.push_back(layer); - CHECK(!layerMap_.count(layer->getName())); - layerMap_[layer->getName()] = layer; - }; - - auto subModelConfig = std::find_if(config.sub_models().begin(), - config.sub_models().end(), - [=](const SubModelConfig& sub_model) { - return sub_model.name() == subModelName_; - }); - bool useSubModel = (subModelConfig != config.sub_models().end()); - CHECK_EQ(useSubModel, !subModelName_.empty()); - if (useSubModel) { - layers_.reserve(subModelConfig->layer_names_size()); - for (const auto& layer_name : subModelConfig->layer_names()) { - auto layer_config = - std::find_if(config.layers().begin(), - config.layers().end(), - [=](const LayerConfig& layer_config) { - return layer_config.name() == layer_name; - }); - CHECK(layer_config != config.layers().end()); - layerCreate(*layer_config); - } - } else { - layers_.reserve(config.layers_size()); - for (const auto& layer_config : config.layers()) { - bool useLayer = true; - if (config.has_external_config()) { - useLayer = true; - for (const auto& name : config.external_config().layer_names()) { - if (layer_config.name() == name) { - useLayer = false; - break; - } - } - } - if (useLayer) { - layerCreate(layer_config); - } - } - } - - for (const auto& layer : layers_) { - layer->init(layerMap_, parameterMap_); - layer->initSubNetwork(this /*root*/, config_, parameterTypes, useGpu); - } - - for (const auto& layer_name : - (useSubModel ? subModelConfig->input_layer_names() - : config.input_layer_names())) { - auto it = layerMap_.find(layer_name); - CHECK(it != layerMap_.end()); - dataLayers_.push_back(std::dynamic_pointer_cast(it->second)); - } - - for (const auto& layer_name : - (useSubModel ? 
subModelConfig->output_layer_names() - : config.output_layer_names())) { - auto it = layerMap_.find(layer_name); - CHECK(it != layerMap_.end()); - outputLayers_.push_back(it->second); - } - - for (const auto& layer : layers_) { - const auto& name = layer->getName(); - bool isMiddleLayer = true; - - // if data layer - for (const auto& dataLayer : dataLayers_) { - if (name == dataLayer->getName()) { - isMiddleLayer = false; - break; - } - } - - // if output layer - for (const auto& dataLayer : outputLayers_) { - if (name == dataLayer->getName()) { - isMiddleLayer = false; - break; - } - } - - if (isMiddleLayer) { - middleLayers_.push_back(layer); - } - } -} - -void NeuralNetwork::connect(LayerPtr agentLayer, - LayerPtr realLayer, - int height) { -#ifndef PADDLE_MOBILE_INFERENCE - AgentLayer* agent = dynamic_cast(agentLayer.get()); - CHECK_NOTNULL(agent); - agent->setRealLayer(realLayer, height); -#endif -} - -void NeuralNetwork::connect(std::string agentLayerName, - NeuralNetwork* srcNN, - std::string realLayerName) { - connect(this->getLayer(agentLayerName), srcNN->getLayer(realLayerName)); -} - -void NeuralNetwork::prefetch(const std::vector& inArgs) { - CHECK_EQ(inArgs.size(), dataLayers_.size()); - - if (paramSelfInited_) { - for (auto& para : parameters_) { - if (para->isSparseRemoteUpdate()) { - auto mat = dynamic_cast( - para->getMat(PARAMETER_VALUE).get()); - para->clearGradient(); - if (mat) mat->clearIndices(); - } - } - } - - for (size_t i = 0; i != dataLayers_.size(); ++i) { - if (FLAGS_parallel_nn) { - const_cast(inArgs[i]).deviceId = -1; - } - dataLayers_[i]->setData(inArgs[i]); - } - - for (auto& layer : layers_) { - layer->prefetch(); - } - - if (paramSelfInited_) { - for (auto& para : parameters_) { - if (para->isSparseRemoteUpdate()) { - auto mat = dynamic_cast( - para->getMat(PARAMETER_VALUE).get()); - mat->setupIndices(); - auto matGrad = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); - matGrad->reserveStore(); - } - } - } -} - -void NeuralNetwork::forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType) { - CHECK_EQ(inArgs.size(), dataLayers_.size()); - outArgs->resize(outputLayers_.size()); - for (size_t i = 0; i != dataLayers_.size(); ++i) { - dataLayers_[i]->setData(inArgs[i]); - } - - gLayerStackTrace.set_stage(true); - - { - for (auto& layer : layers_) { - REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str()); - gLayerStackTrace.push(layer->getName()); - layer->forward(passType); - gLayerStackTrace.pop(layer->getName()); - } - } - - outArgs->clear(); - outArgs->reserve(outputLayers_.size()); - for (auto& layer : outputLayers_) { - outArgs->push_back(layer->getOutput()); - } -} - -void NeuralNetwork::resetState() { - for (auto& layer : layers_) { - layer->resetState(); - } -} - -void NeuralNetwork::setState(const MachineState& machineState) { - for (size_t i = 0; i < layers_.size(); i++) { - if (machineState[i] != nullptr) { - layers_[i]->setState(machineState[i]); - } - } -} - -void NeuralNetwork::getState(MachineState& machineState) { - machineState.clear(); - machineState.reserve(layers_.size()); - for (auto& layer : layers_) { - LayerStatePtr p = layer->getState(); - machineState.push_back(p); - } -} - -void NeuralNetwork::backward(const UpdateCallback& callback) { - gLayerStackTrace.set_stage(false); - FOR_EACH_R(layer, layers_) { - REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str()); - gLayerStackTrace.push((*layer)->getName()); - if ((*layer)->needGradient()) { - (*layer)->backward(callback); - } - 
gLayerStackTrace.pop((*layer)->getName()); - } -} - -void NeuralNetwork::finish() { -#ifdef PADDLE_WITH_MKLDNN - FOR_EACH_R(layer, layers_) { - MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(*layer); - if (dnnLayer) { - dnnLayer->convertWeightsToPaddle(); - } - } -#endif -} - -Argument NeuralNetwork::getLayerOutput(const std::string& layerName) { - return getLayer(layerName)->getOutput(); -} - -void NeuralNetwork::onPassEnd() { - for (auto& layer : layers_) { - layer->onPassEnd(); - } -} - -void NeuralNetwork::releaseOutput() { - for (auto& layer : middleLayers_) { - Argument& arg = layer->getOutput(); - arg.value.reset(); - } -} - -#ifndef PADDLE_MOBILE_INFERENCE - -class CombinedEvaluator : public Evaluator { - public: - void addEvaluator(std::unique_ptr&& evaluator) { - evaluators_.emplace_back(std::move(evaluator)); - } - void start() override { - for (auto& evaluator : evaluators_) { - evaluator->start(); - } - } - - void finish() override { - for (auto& evaluator : evaluators_) { - evaluator->finish(); - } - } - - void eval(const NeuralNetwork& nn) override { - for (auto& evaluator : evaluators_) { - evaluator->eval(nn); - } - } - real evalImp(std::vector& arguments) override { - (void)arguments; - return -1; - } - void printStats(std::ostream& os) const override { - for (auto& evaluator : evaluators_) { - evaluator->printStats(os); - os << ' '; - } - } - - void distributeEval(ParameterClient2* client) override { - for (auto& evaluator : evaluators_) { - evaluator->distributeEval(client); - } - } - - protected: - std::vector> evaluators_; - - // Evaluator interface - public: - /** - * @brief getNames will return all inside evaluators' names. - * @param names [out]: return names. - */ - void getNames(std::vector* names) override { - for (auto& eval : evaluators_) { - eval->getNames(names); - } - } - - /** - * @brief getValue could get all inside evaluators' value. - */ - real getValue(const std::string& name, Error* err) const override { - return this->getMethodHelper( - name, err, [&name, err](const std::unique_ptr& eval) { - return eval->getValue(name, err); - }); - } - - /** - * @brief getType could get all inside evaluators' type. 
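As a usage sketch for CombinedEvaluator's name-dispatched interface (hypothetical caller, assuming Error::isOK() reports success): enumerate the combined evaluator's field names via getNames(), then read each metric back through getValue()/getType().

void dumpMetrics(Evaluator& eval, std::ostream& os) {
  std::vector<std::string> names;
  eval.getNames(&names);  // e.g. "precision_recall_evaluator.precision"
  for (const auto& name : names) {
    Error err;
    real value = eval.getValue(name, &err);
    if (err.isOK()) {
      os << name << " (" << eval.getType(name, &err) << ") = " << value
         << '\n';
    }
  }
}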
- */ - std::string getType(const std::string& name, Error* err) const override { - return this->getMethodHelper( - name, err, [&name, err](const std::unique_ptr& eval) { - return eval->getType(name, err); - }); - } - - private: - template - T getMethodHelper(const std::string& name, - Error* err, - const std::function&)>& - callback) const { - for (auto& eval : evaluators_) { - std::vector names; - eval->getNames(&names); - if (std::find(names.begin(), names.end(), name) != names.end()) { - return callback(eval); - } - } - *err = Error("No such key %s", name.c_str()); - return T(); - } -}; - -class SubnetEvaluator : public CombinedEvaluator { - public: - SubnetEvaluator(const std::string& layerName, - std::unique_ptr&& evaluator) - : layerName_(layerName) { - addEvaluator(std::move(evaluator)); - } - void eval(const NeuralNetwork& nn) override { - const LayerPtr& layer = nn.getLayer(layerName_); - CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel " - << nn.getName(); - bool accessed = false; - layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) { - subnet.eval(evaluators_[0].get()); - accessed = true; - }); - CHECK(accessed) << "There is no subnetwork for layer " << layerName_ - << " in submodel " << nn.getName(); - } - - protected: - std::string layerName_; -}; - -Evaluator* NeuralNetwork::makeEvaluator() const { - CombinedEvaluator* combinedEvaluator = new CombinedEvaluator(); - auto subModelConfig = std::find_if(config_.sub_models().begin(), - config_.sub_models().end(), - [=](const SubModelConfig& sub_model) { - return sub_model.name() == subModelName_; - }); - bool useSubModel = (subModelConfig != config_.sub_models().end()); - CHECK_EQ(useSubModel, !subModelName_.empty()); - if (useSubModel) { - // create the evaluators that belong to CURRENT submodel - for (int i = 0; i < subModelConfig->evaluator_names_size(); ++i) { - // find evaluator by name - auto thisEvalConfig = std::find_if( - config_.evaluators().begin(), - config_.evaluators().end(), - [=](const EvaluatorConfig& ecfg) { - return ecfg.name() == subModelConfig->evaluator_names(i); - }); - bool validConfig = (thisEvalConfig != config_.evaluators().end()); - if (validConfig) { - std::unique_ptr evaluator( - Evaluator::create(*thisEvalConfig)); - combinedEvaluator->addEvaluator(std::move(evaluator)); - } - } - for (auto& layer : layers_) { - layer->accessSubNetwork( - [layer, combinedEvaluator](NeuralNetwork& subnet) { - std::unique_ptr subEvaluator(new SubnetEvaluator( - layer->getName(), - std::unique_ptr(subnet.makeEvaluator()))); - combinedEvaluator->addEvaluator(std::move(subEvaluator)); - }); - } - } else { - for (const EvaluatorConfig& evalConfig : config_.evaluators()) { - std::unique_ptr evaluator(Evaluator::create(evalConfig)); - combinedEvaluator->addEvaluator(std::move(evaluator)); - } - } - return combinedEvaluator; -} - -void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); } - -#endif - -void NeuralNetwork::setOutputGrad(const std::vector& args) { - CHECK_GE(outputLayers_.size(), args.size()); - for (size_t i = 0; i < args.size(); ++i) { - outputLayers_[i]->getOutput().grad = args[i].grad; - } -} - -extern NeuralNetwork* newCustomNerualNetwork(const std::string& name, - NeuralNetwork* network) - __attribute__((weak)); - -NeuralNetwork* NeuralNetwork::newNeuralNetwork(const std::string& name, - NeuralNetwork* rootNetwork) { - if (newCustomNerualNetwork) { - return newCustomNerualNetwork(name, rootNetwork); - } else { - return new NeuralNetwork(name, 
rootNetwork); - } -} - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h deleted file mode 100644 index 3e5615c8f0b30ab1283d41e025496051869289dc..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/NeuralNetwork.h +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "ModelConfig.pb.h" -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" -#include "paddle/gserver/layers/CostLayer.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/Layer.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/utils/ClassRegistrar.h" - -namespace paddle { -/* - * @brief Init function for the parameters. - * @param paramId: the id of the parameter to init. - * @param para: the pointer to the parameter to init. - * @param sharedParams: the pointer to an array of the parameter to be shared. - * If it is null, no parameter sharing is used. - * Only CPU paramters can be shared. - * It handles CPU, CPU sparse, CPU sparse remote, - * and GPU parameters differently. If the type - * of a parameter is NORMAL. Basically nothing need to be done. - * CPU value: NORMAL. - * CPU param: NORMAL. - * - * CPU sparse value: NORMAL. - * CPU sparse gradient: MAT_SPARSE_ROW_AUTO_GROW. - * - * CPU sparse remote value: MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE). - * CPU sparse remote gradient: MAT_SPARSE_ROW (!sharedParams) - * MAT_SPARSE_ROW_AUTO_GROW (sharedParams) - * - * GPU value: NORMAL - * GPU param: NORMAL - */ -void parameterInitNN(int paramId, - Parameter* para, - std::vector* sharedParams); - -class NeuralNetwork : public GradientMachine { - public: - virtual void init(const ModelConfig& config, - ParamInitCallback callback = nullptr, - const std::vector& parameterTypes = - std::vector{PARAMETER_VALUE, - PARAMETER_GRADIENT, - PARAMETER_MOMENTUM}, - bool useGpu = FLAGS_use_gpu); - - /** - * Connect two submodels and - * down-submodel's output become up-submodel's input. - * By default, connection is one by one, - * If the agent height is smaller than real layer, *height* has to be filled. - * - * @param realLayer The down-submodel's output layer. - * @param agentLayer The up-submodel's input agent layer. 
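To make the two connect() overloads declared just below concrete, a small sketch with hypothetical layer names ("encoder_out" / "encoder_out_agent"): the down-submodel's real output layer is attached to the up-submodel's agent input layer, either through name lookup across networks or directly by layer pointer.

void wireSubmodels(NeuralNetwork* upNN, NeuralNetwork* downNN) {
  // By name: the agent layer in upNN mirrors the real layer in downNN.
  upNN->connect("encoder_out_agent", downNN, "encoder_out");

  // Equivalent wiring by layer pointer; per the comment below, a non-zero
  // height is needed when the agent covers fewer rows than the real layer.
  NeuralNetwork::connect(upNN->getLayer("encoder_out_agent"),
                         downNN->getLayer("encoder_out"),
                         /*height=*/0);
}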
- */ - static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0); - void connect(std::string agentLayerName, - NeuralNetwork* srcNN, - std::string realLayerName); - - virtual void prefetch(const std::vector& inArgs); - - virtual void forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType); - - virtual void backward(const UpdateCallback& callback = nullptr); - - virtual Argument getLayerOutput(const std::string& layerName); - - const LayerPtr& getLayer(const std::string& layerName) const { - auto it = layerMap_.find(layerName); - CHECK(it != layerMap_.end()) << "Unknown layer " << layerName; - return it->second; - } - - virtual void onPassEnd(); - -#ifndef PADDLE_MOBILE_INFERENCE - virtual Evaluator* makeEvaluator() const; - - virtual void eval(Evaluator* evaluator) const; -#endif - - virtual void resetState(); - virtual void setOutputGrad(const std::vector& args); - - /// set machine state - virtual void setState(const MachineState& machineState); - - /// get machine state - virtual void getState(MachineState& machineState); - - static NeuralNetwork* create(const ModelConfig& config); - - ParameterMap* getParameterMap() { return ¶meterMap_; } - - /** - * @brief Access each layer as a for each loop. - * @param callback invoke with each layer. - */ - template - void forEachLayer(T callback) { - for (auto& l : layers_) { - if (callback(l)) { - break; - } - } - } - - static NeuralNetwork* newNeuralNetwork(const std::string& name = "", - NeuralNetwork* rootNetwork = nullptr); - - const std::string& getName() const { return subModelName_; } - - /// some finish work, like convert the weight format of MKLDNNLayers - void finish(); - - /** - * @brief Release the middle layer's output memory. - * - * @note This function is used for memory optimization in inference. - */ - void releaseOutput(); - - protected: - /** - * The constructor of NeuralNetwork. - * The sub networks can get parameters_ and parameterMap_ - * from base NeuralNetwork. - * - * @param subModelName The name of sub-model. - * @param rootNetwork It used in MultiNetwork. - */ - NeuralNetwork(std::string subModelName = "", - NeuralNetwork* rootNetwork = nullptr) - : subModelName_(subModelName), rootNetwork_(rootNetwork) {} - - std::string subModelName_; - ModelConfig config_; - std::vector layers_; - ParameterMap parameterMap_; - LayerMap layerMap_; - - std::vector dataLayers_; - std::vector outputLayers_; - std::vector middleLayers_; - - static std::map dllInitMap; - - NeuralNetwork* rootNetwork_; - - /// Whether parameter of this NN is initialized by its own - /// (i.e., not by callback supplied with the caller) - bool paramSelfInited_; -}; - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp deleted file mode 100644 index 73ac8cda721f200c1a02cd9c1d9456df70d7b7d2..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ /dev/null @@ -1,1501 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "RecurrentGradientMachine.h" -#include -#include -#include -#include -#include -#include "NeuralNetwork.h" -#include "paddle/gserver/layers/AgentLayer.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so"); - -static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob"; -static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob"; -static const char* DIY_FINISH_CALC_PROB_SYMBOL_NAME = "finish_calc_prob"; - -namespace paddle { - -/** - * Start Custom Calculate Probability callback type. - * - * @param nNode, nodes: the path will be explored. nNodes is array size. - * nodes is array elements. - * - * @return: A custom handler id that will passed to another callback. - */ -typedef int (*DiyStartCalcProbCallback)(size_t nNodes, int* nodes); - -/** - * Doing Custom Calculation of Probability callback type. - * - * @param handler: User custom handler. The return value from start calc prob. - * @param nNode, nodes: Array. The current path. - * @param curProb: The current log probability that neural network returns. - * - * @return: Log probability which user calculated, it will be updated to this - * path. - * @NOTE: Return -INFINITY will DROP this path IMMEDIATELY!! - */ -typedef real (*DiyCalcProbCallback)( - int handler, size_t nNodes, int* nodes, real curProb, bool atEos); - -/** - * Finish Custom Calculation of Probability callback type. - * - * @param handler: User custom handler. The return value from start calc prob. - */ -typedef void (*DiyStopCalcProbCallback)(int handler); - -static DiyCalcProbCallback gDiyProbMethod = nullptr; -static DiyStartCalcProbCallback gDiyProbStart = nullptr; -static DiyStopCalcProbCallback gDiyProbStop = nullptr; -static void* gDiyProbHandle = nullptr; - -static void exit_diy_prob() { dlclose(gDiyProbHandle); } - -template -static inline SymbolType loadDiySymbol(const char* symbolName) { - void* sym = dlsym(gDiyProbHandle, symbolName); - CHECK(sym) << "Cannot load symbol " << symbolName << " from " - << FLAGS_diy_beam_search_prob_so; - return reinterpret_cast(sym); -} - -static InitFunction __init__diy_prob_method( - [] { - std::string soName = FLAGS_diy_beam_search_prob_so; - if (!soName.empty()) { - gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY); - CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName; - atexit(exit_diy_prob); - gDiyProbMethod = - loadDiySymbol(DIY_CALC_PROB_SYMBOL_NAME); - gDiyProbStart = loadDiySymbol( - DIY_START_CALC_PROB_SYMBOL_NAME); - gDiyProbStop = loadDiySymbol( - DIY_FINISH_CALC_PROB_SYMBOL_NAME); - } - }, - std::numeric_limits::max()); - -class BeamSearchControlCallbacks { - public: - RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback - beamSearchCandidateAdjust; - RecurrentGradientMachine::NormOrDropNodeCallback normOrDropNode; - RecurrentGradientMachine::DropCallback stopDetermineCandidates; - - //! for gcc46 aggregate initialization is not very well, so we need to - //! 
explicit - BeamSearchControlCallbacks( - const RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback& - candidateAdjust, - const RecurrentGradientMachine::NormOrDropNodeCallback& norm, - const RecurrentGradientMachine::DropCallback& stop) - : beamSearchCandidateAdjust(candidateAdjust), - normOrDropNode(norm), - stopDetermineCandidates(stop) {} -}; - -class BeamSearchStatisticsCallbacks { - public: - RecurrentGradientMachine::EachStepCallback onEachStepStarted; - RecurrentGradientMachine::EachStepCallback onEachStepStoped; - - BeamSearchStatisticsCallbacks( - const RecurrentGradientMachine::EachStepCallback& start, - const RecurrentGradientMachine::EachStepCallback& stop) - : onEachStepStarted(start), onEachStepStoped(stop) {} -}; - -RecurrentGradientMachine::RecurrentGradientMachine( - const std::string& subModelName, NeuralNetwork* rootNetwork) - : NeuralNetwork(subModelName), - rootNetwork_(rootNetwork), - beamSearchCtrlCallbacks_(nullptr), - beamSearchStatistics_(nullptr) { - CHECK(!subModelName_.empty()); -} - -/** - * bias layer, as input of memory frame 0 will give vector of zeros - * if bias parameter is not set. - * - * boot bias layer create directly in recurrent gradient machine, because: - * - * 1. It is only one frame, so it should not be placed in layer group, - * which is one instance for every one frame. - * - * 2. It is no input layer, so it need resetHeight() before forward(), - * and resetHeight() must be called in recurrent gradient machine, - * so it's should not be placed in root network. - */ -class BootBiasLayer : public Layer { - protected: - std::unique_ptr biases_; - IVectorPtr cpuIds_; - - public: - explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override { - if (!Layer::init(layerMap, parameterMap)) return false; - - if (biasParameter_) { - biases_ = - std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - return true; - } - - void resetHeight(int height) { - if (config_.has_bos_id()) { // used as a constant id layerConfig - IVector::resizeOrCreate(output_.ids, height, useGpu_); - output_.ids->reset((int)config_.bos_id()); - } else { - resetOutput(height, getSize()); - } - } - - void forward(PassType passType) override { - if (biases_) { - MatrixPtr outV = getOutputValue(); - outV->addBias(*(biases_->getW()), 1); - forwardActivation(); - } - } - - void backward(const UpdateCallback& callback) override { - if (biases_ && biases_->getWGrad()) { - backwardActivation(); - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - biases_->getParameterPtr()->incUpdate(callback); - } - } -}; - -void RecurrentGradientMachine::init( - const ModelConfig& config, - ParamInitCallback callback, - const std::vector& parameterTypes, - bool useGpu) { - NeuralNetwork::init(config, callback, parameterTypes, useGpu); - useGpu_ = useGpu; - - auto subModelConfig = - std::find_if(config.sub_models().begin(), - config.sub_models().end(), - [this](const SubModelConfig& sub_model) { - return sub_model.name() == this->subModelName_; - }); - CHECK(subModelConfig != config.sub_models().end()); - reversed_ = subModelConfig->reversed(); - generating_ = subModelConfig->has_generator(); - - inFrameLines_.resize(subModelConfig->in_links_size()); - for (size_t i = 0; i < inFrameLines_.size(); ++i) { - inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name(); - inFrameLines_[i].inLayer = - rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name()); - } - - 
outFrameLines_.resize(subModelConfig->out_links_size()); - for (size_t i = 0; i < outFrameLines_.size(); ++i) { - auto& linkPair = subModelConfig->out_links(i); - outFrameLines_[i].layerName = linkPair.layer_name(); - outFrameLines_[i].agentLayer = rootNetwork_->getLayer(linkPair.link_name()); - } - - memoryFrameLines_.resize(subModelConfig->memories_size()); - for (size_t i = 0; i < memoryFrameLines_.size(); ++i) { - auto& memoryConfig = subModelConfig->memories(i); - memoryFrameLines_[i].layerName = memoryConfig.layer_name(); - memoryFrameLines_[i].linkName = memoryConfig.link_name(); - auto agentConfig = - std::find_if(config.layers().begin(), - config.layers().end(), - [&memoryConfig](const LayerConfig& layerConfig) { - return layerConfig.name() == memoryConfig.link_name(); - }); - CHECK(agentConfig != config.layers().end()); - if (memoryConfig.has_boot_layer_name()) { - memoryFrameLines_[i].rootLayer = - rootNetwork_->getLayer(memoryConfig.boot_layer_name()); - - LayerConfig scatterConfig = *agentConfig; - memoryFrameLines_[i].rootAgent.reset( - new ScatterAgentLayer(scatterConfig)); - memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_); - - memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent; - } else { - LayerConfig biasConfig = *agentConfig; - if (memoryConfig.has_boot_bias_parameter_name()) { - biasConfig.set_bias_parameter_name( - memoryConfig.boot_bias_parameter_name()); - biasConfig.set_active_type(memoryConfig.boot_bias_active_type()); - } else if (memoryConfig.has_boot_with_const_id()) { - biasConfig.set_bos_id(memoryConfig.boot_with_const_id()); - } - memoryFrameLines_[i].biasLayer.reset(new BootBiasLayer(biasConfig)); - memoryFrameLines_[i].biasLayer->init(LayerMap(), parameterMap_); - - memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].biasLayer; - } - - if (subModelConfig->has_generator()) { - memoryFrameLines_[i].scatterAgents.resize(2); - for (auto& agent : memoryFrameLines_[i].scatterAgents) { - agent.reset(new ScatterAgentLayer(*agentConfig)); - agent->init(LayerMap(), parameterMap_); - } - } - } - - if (subModelConfig->has_generator()) { - generator_.config = subModelConfig->generator(); - eosFrameLine_.reset(new EosFrameLine); - maxSequenceLength_ = generator_.config.max_num_frames(); - } - - // get parameters actually used by this Layer Group - resizeOrCreateFrames(1); - for (auto& para : frames_[0]->getParameters()) { - if (para->getSharedCount() > 0) { - parameterIds_.push_back(para->getID()); - } - } - for (auto& para : parameters_) { // bias layer parameters - if (para->getSharedCount() > 0) { - parameterIds_.push_back(para->getID()); - } - } -} - -void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { - if ((size_t)numFrames <= frames_.size()) { - return; - } - - frames_.reserve(numFrames); - for (auto& inFrameLine : inFrameLines_) { - inFrameLine.agents.reserve(numFrames); - } - for (auto& outFrameLine : outFrameLines_) { - outFrameLine.frames.reserve(numFrames); - } - for (auto& memoryFrameLine : memoryFrameLines_) { - memoryFrameLine.frames.reserve(numFrames); - memoryFrameLine.agents.reserve(numFrames); - } - if (eosFrameLine_) { - eosFrameLine_->layers.reserve(numFrames); - } - - ParamInitCallback subParamInitCb = [this](int paramId, Parameter* para) { - para->enableSharedType(PARAMETER_VALUE, - this->parameters_[paramId]->getBuf(PARAMETER_VALUE), - this->parameters_[paramId]->getMat(PARAMETER_VALUE)); - para->enableSharedType( - PARAMETER_GRADIENT, - this->parameters_[paramId]->getBuf(PARAMETER_GRADIENT), - 
this->parameters_[paramId]->getMat(PARAMETER_GRADIENT)); - }; - - for (int i = frames_.size(); i < numFrames; ++i) { - std::unique_ptr frame( - NeuralNetwork::newNeuralNetwork(subModelName_)); - frame->init(config_, subParamInitCb); - - for (auto& inFrameLine : inFrameLines_) { - inFrameLine.agents.push_back(frame->getLayer(inFrameLine.linkName)); - } - - for (auto& outFrameLine : outFrameLines_) { - outFrameLine.frames.push_back(frame->getLayer(outFrameLine.layerName)); - } - for (auto& memoryFrameLine : memoryFrameLines_) { - memoryFrameLine.frames.push_back( - frame->getLayer(memoryFrameLine.layerName)); - memoryFrameLine.agents.push_back( - frame->getLayer(memoryFrameLine.linkName)); - } - if (eosFrameLine_) { - eosFrameLine_->layers.push_back( - frame->getLayer(generator_.config.eos_layer_name())); - } - - frames_.emplace_back(std::move(frame)); - } -} - -void RecurrentGradientMachine::resizeBootFrame(int numSequences) { - for (auto& memoryFrameLine : memoryFrameLines_) { - if (memoryFrameLine.biasLayer) { - auto biasLayer = - dynamic_cast(memoryFrameLine.biasLayer.get()); - CHECK_NOTNULL(biasLayer); - biasLayer->resetHeight(numSequences); - } else { // check input root layer height - CHECK_EQ(numSequences, - memoryFrameLine.rootLayer->getOutput().getNumSequences()); - } - } -} - -void RecurrentGradientMachine::prefetch(const std::vector& inArgs) { - LOG(FATAL) << "should not use this function"; -} - -void RecurrentGradientMachine::checkInputConsistency( - int inlinkId, const std::vector& seqInfo) { - if (commonSeqInfo_.empty()) { - commonSeqInfo_.resize(seqInfo.size()); - for (size_t i = 0; i < seqInfo.size(); ++i) { - commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength; - commonSeqInfo_[i].seqId = seqInfo[i].seqId; - } - } else { - CHECK_EQ(commonSeqInfo_.size(), seqInfo.size()) - << " RecurrentGroup " << subModelName_ << " input " << inlinkId - << " has mismatched number of sequences"; - for (size_t i = 0; i < seqInfo.size(); ++i) { - CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength) - << " RecurrentGroup " << subModelName_ << " input " << inlinkId - << " has mismatched sequence length"; - CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId) - << " RecurrentGroup " << subModelName_ << " input " << inlinkId - << " has mismatched sequence length"; - } - } -} - -void RecurrentGradientMachine::calcNumSequencesAtEachStep() { - int numSequences = commonSeqInfo_.size(); - numSeqs_.resize(maxSequenceLength_); - for (int i = 0; i < numSequences; ++i) { - for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) { - numSeqs_[j] = i + 1; - } - } -} - -void RecurrentGradientMachine::reorganizeInput(PassType passType) { - info_.clear(); - info_.resize(inFrameLines_.size()); - - commonSeqInfo_.clear(); - seqInfos_.clear(); - seqInfos_.resize(inFrameLines_.size()); - - for (size_t i = 0; i < inFrameLines_.size(); i++) { - const Argument& input = inFrameLines_[i].inLayer->getOutput(); - if (!input.hasSeq()) { - continue; - } - input.getSeqInfo(&seqInfos_[i]); - checkInputConsistency(i, seqInfos_[i]); - } - CHECK(!commonSeqInfo_.empty()) - << "At least one input needs to be sequence or subsequence"; - maxSequenceLength_ = commonSeqInfo_[0].topLevelLength; - - calcNumSequencesAtEachStep(); - - for (size_t i = 0; i < inFrameLines_.size(); ++i) { - const Argument& input = inFrameLines_[i].inLayer->getOutput(); - if (!input.hasSeq()) { - seqInfos_[i] = commonSeqInfo_; - } - createInFrameInfo(i, input, passType); - } - - { - AsyncGpuBlock asyncGpuBlock; - - // inFrameLine 
select rows in real layer one time - for (size_t i = 0; i < inFrameLines_.size(); i++) { - selectRowsOneTime(inFrameLines_[i].inLayer, - info_[i].allIds, - &(inFrameLines_[i].outArg), - passType); - } - } -} - -void RecurrentGradientMachine::reorganizeOutput(PassType passType) { - calcSequenceStartPositions(); - for (size_t i = 0; i < outFrameLines_.size(); ++i) { - Info info; - auto& outFrameLine = outFrameLines_[i]; - ICpuGpuVectorPtr sequenceStartPositions; - ICpuGpuVectorPtr subSequenceStartPositions; - createOutFrameInfo( - outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); - auto gatherAgent = - dynamic_cast(outFrameLine.agentLayer.get()); - CHECK_NOTNULL(gatherAgent); - gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions, - subSequenceStartPositions, - info.allIds, - info.idIndex); - } -} - -void RecurrentGradientMachine::connectFrames(PassType passType) { - for (auto& memoryFrameLine : memoryFrameLines_) { - if (memoryFrameLine.rootAgent) { - auto scatterAgent = - dynamic_cast(memoryFrameLine.rootAgent.get()); - createMemoryFrameInfo(&memoryFrameLine, passType); - scatterAgent->setRealLayerAndOutput(memoryFrameLine.rootLayer, - memoryFrameLine.outArg, - memoryFrameLine.allIds, - /* idIndex */ 0, - memoryFrameLine.allIds->getSize(), - /* handleBackward */ true); - if (memoryFrameLine.sequenceStartPositions) { - int size = memoryFrameLine.sequenceStartPositions->getSize(); - scatterAgent->setSequenceStartPositions( - memoryFrameLine.sequenceStartPositions, - /* seqStartPosIndex */ 0, - size); - } - } - } - - for (auto& outFrameLine : outFrameLines_) { - auto gatherAgent = - dynamic_cast(outFrameLine.agentLayer.get()); - gatherAgent->clearRealLayers(); - } - for (int i = 0; i < maxSequenceLength_; ++i) { - // connect in_links - for (size_t j = 0; j < inFrameLines_.size(); ++j) { - Info& info = info_[j]; - // idSize denotes the sum number of tokens in each length i - int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i]; - int idSize = info.idIndex.empty() ? numSeqs_[i] - : info.idIndex[i + 1] - info.idIndex[i]; - InFrameLine inFrameLine = inFrameLines_[j]; - auto scatterAgent = - dynamic_cast(inFrameLine.agents[i].get()); - scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer, - inFrameLine.outArg, - info.allIds, - idIndex, - idSize, - i == 0); - if (info.sequenceStartPositions) { - // size: the length of subsequence - int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i]; - scatterAgent->setSequenceStartPositions( - info.sequenceStartPositions, info.seqStartPosIndex[i], size); - } - } - - // connect out_links - for (auto& outFrameLine : outFrameLines_) { - auto gatherAgent = - dynamic_cast(outFrameLine.agentLayer.get()); - gatherAgent->addRealLayer(outFrameLine.frames[i]); - } - for (auto& memoryFrameLine : memoryFrameLines_) { - NeuralNetwork::connect( - memoryFrameLine.agents[i], - i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1], - numSeqs_[i] /*height of agent*/); - } - } -} - -void RecurrentGradientMachine::forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType) { - /* inArgs and outArgs are not used. - The inputs are inFrameLines_[i].inLayer. - The outputs are outFramesLines_[i].agentLayer - */ - - if (generating_) { - generateSequence(); - return; - } // else forward.. 
- - reorganizeInput(passType); - int numSequences = commonSeqInfo_.size(); - - resizeOrCreateFrames(maxSequenceLength_); - resizeBootFrame(numSequences); - - connectFrames(passType); - - REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime"); - // forward - for (auto& memoryFrameLine : memoryFrameLines_) { - memoryFrameLine.bootLayer->forward(passType); - } - for (int i = 0; i < maxSequenceLength_; ++i) { - const std::vector inArgs; - std::vector outArgs; - frames_[i]->forward(inArgs, &outArgs, passType); - } - - reorganizeOutput(passType); -} - -void RecurrentGradientMachine::backward(const UpdateCallback& callback) { - if (generating_) { - return; - } - REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime"); - AsyncGpuBlock asyncGpuBlock; - for (int i = maxSequenceLength_ - 1; i >= 0; --i) { - frames_[i]->backward(nullptr); - } - for (auto& memoryFrameLine : memoryFrameLines_) { - memoryFrameLine.bootLayer->backward(nullptr); - } -} - -void RecurrentGradientMachine::forwardBackward( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback) { - LOG(FATAL) << "should not use this function"; -} - -void RecurrentGradientMachine::eval(Evaluator* evaluator) const { - // call printers frame by frame - for (int i = 0; i < maxSequenceLength_; ++i) { - VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin"; - evaluator->eval(*(frames_[i].get())); - VLOG(2) << "Recurrent Layer Group eval frame " << i << " end"; - } -} - -void RecurrentGradientMachine::registerBeamSearchControlCallbacks( - const BeamSearchCandidatesAdjustCallback& adjustBeamSearch, - const NormOrDropNodeCallback& normOrDropNode, - const DropCallback& stopBeamSearch) { - this->removeBeamSearchControlCallbacks(); - //! for gcc 46, aggregate initialization is not supported. 
TAT - this->beamSearchCtrlCallbacks_ = new BeamSearchControlCallbacks( - adjustBeamSearch, normOrDropNode, stopBeamSearch); -} - -void RecurrentGradientMachine::removeBeamSearchControlCallbacks() { - if (this->beamSearchCtrlCallbacks_) { - delete this->beamSearchCtrlCallbacks_; - this->beamSearchCtrlCallbacks_ = nullptr; - } -} - -void RecurrentGradientMachine::registerBeamSearchStatisticsCallbacks( - const EachStepCallback& onEachStepStarted, - const EachStepCallback& onEachStepStoped) { - this->removeBeamSearchStatisticsCallbacks(); - this->beamSearchStatistics_ = - new BeamSearchStatisticsCallbacks(onEachStepStarted, onEachStepStoped); -} - -void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() { - if (this->beamSearchStatistics_) { - delete this->beamSearchStatistics_; - this->beamSearchStatistics_ = nullptr; - } -} - -namespace { -void lenToStarts(std::vector& starts) { - int pos = 0; - starts.back() = 0; - for (auto& start : starts) { - int tmp = start; - start = pos; - pos += tmp; - } - starts.back() = pos; -} -} // namespace - -void RecurrentGradientMachine::calcSequenceStartPositions() { - std::vector starts(commonSeqInfo_.size() + 1); - for (auto& seqInfo : commonSeqInfo_) { - starts[seqInfo.seqId] = seqInfo.topLevelLength; - } - lenToStarts(starts); - ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false); - std::copy(starts.begin(), - starts.end(), - sequenceStartPositions_->getMutableData(false)); -} - -void RecurrentGradientMachine::checkOutputConsistency( - OutFrameLine& outFrameLine) { - bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq(); - for (int i = 0; i < maxSequenceLength_; ++i) { - LayerPtr frame = outFrameLine.frames[i]; - CHECK_EQ(hasSeq, frame->getOutput().hasSeq()); - int numSequences = frame->getOutput().getNumSequences(); - CHECK_EQ(numSeqs_[i], numSequences); - } -} - -void RecurrentGradientMachine::createOutFrameInfo( - OutFrameLine& outFrameLine, - Info& info, - ICpuGpuVectorPtr& sequenceStartPositions, - ICpuGpuVectorPtr& subSequenceStartPositions) { - checkOutputConsistency(outFrameLine); - - if (!outFrameLine.frames[0]->getOutput().hasSeq()) { - createOutFrameInfo_seq( - outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); - } else { - createOutFrameInfo_subseq( - outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); - } -} - -void RecurrentGradientMachine::createOutFrameInfo_seq( - OutFrameLine& outFrameLine, - Info& info, - ICpuGpuVectorPtr& sequenceStartPositions, - ICpuGpuVectorPtr& subSequenceStartPositions) { - std::vector allIds; - info.idIndex.resize(1, 0); // first idIndex = 0 - - const int* starts = sequenceStartPositions_->getData(false); - - for (int i = 0; i < maxSequenceLength_; ++i) { - LayerPtr frame = outFrameLine.frames[i]; - size_t numSequences = frame->getOutput().getNumSequences(); - for (size_t j = 0; j < numSequences; ++j) { - int seqStart = starts[commonSeqInfo_[j].seqId]; - int seqLength = commonSeqInfo_[j].topLevelLength; - allIds.push_back(reversed_ ? 
-
-void RecurrentGradientMachine::checkOutputConsistency(
-    OutFrameLine& outFrameLine) {
-  bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq();
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    CHECK_EQ(hasSeq, frame->getOutput().hasSeq());
-    int numSequences = frame->getOutput().getNumSequences();
-    CHECK_EQ(numSeqs_[i], numSequences);
-  }
-}
-
-void RecurrentGradientMachine::createOutFrameInfo(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  checkOutputConsistency(outFrameLine);
-
-  if (!outFrameLine.frames[0]->getOutput().hasSeq()) {
-    createOutFrameInfo_seq(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-  } else {
-    createOutFrameInfo_subseq(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-  }
-}
-
-void RecurrentGradientMachine::createOutFrameInfo_seq(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  std::vector<int> allIds;
-  info.idIndex.resize(1, 0);  // first idIndex = 0
-
-  const int* starts = sequenceStartPositions_->getData(false);
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    for (size_t j = 0; j < numSequences; ++j) {
-      int seqStart = starts[commonSeqInfo_[j].seqId];
-      int seqLength = commonSeqInfo_[j].topLevelLength;
-      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
-                                 : (seqStart + i));
-    }
-    info.idIndex.push_back(allIds.size());
-  }
-  sequenceStartPositions = sequenceStartPositions_;
-  copyScattedId(allIds, &info.allIds, allIds.size());
-  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-void RecurrentGradientMachine::createOutFrameInfo_subseq(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  size_t numSequences = commonSeqInfo_.size();
-  std::vector<int> allIds;
-  info.idIndex.resize(1, 0);  // first idIndex = 0
-
-  const int* starts = sequenceStartPositions_->getData(false);
-  std::vector<int> subStarts(starts[numSequences] + 1);
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    const int* seqStarts =
-        frame->getOutput().sequenceStartPositions->getData(false);
-    for (size_t j = 0; j < numSequences; ++j) {
-      subStarts[starts[commonSeqInfo_[j].seqId] + i] =
-          seqStarts[j + 1] - seqStarts[j];
-    }
-  }
-  lenToStarts(subStarts);
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    for (size_t j = 0; j < numSequences; ++j) {
-      int pos = starts[commonSeqInfo_[j].seqId] + i;
-      int subSeqStart = subStarts[pos];
-      int subSeqEnd = subStarts[pos + 1];
-      for (int k = subSeqStart; k < subSeqEnd; ++k) {
-        allIds.push_back(k);
-      }
-    }
-    info.idIndex.push_back(allIds.size());
-  }
-
-  ICpuGpuVector::resizeOrCreate(
-      subSequenceStartPositions, subStarts.size(), false);
-  int* cpuSubSequenceStartPositions =
-      subSequenceStartPositions->getMutableData(false);
-  std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions);
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, numSequences + 1, false);
-  int* cpuSequenceStartPositions =
-      sequenceStartPositions->getMutableData(false);
-  for (size_t i = 0; i <= numSequences; ++i) {
-    cpuSequenceStartPositions[i] = subStarts[starts[i]];
-  }
-  copyScattedId(allIds, &info.allIds, allIds.size());
-  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
-}
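The gather-index rule in `createOutFrameInfo_seq` is easier to see with concrete numbers: step i of sequence j maps to row `seqStart + i`, or to `seqStart + seqLength - 1 - i` when the layer group runs reversed. A sketch under assumed inputs (two sequences with starts {0, 3} and lengths {3, 2}; the real code enumerates the frame's surviving sequences rather than skipping by length as done here):

```cpp
#include <iostream>
#include <vector>

int main() {
  const std::vector<int> seqStart = {0, 3};   // from lenToStarts()
  const std::vector<int> seqLength = {3, 2};
  for (bool reversed : {false, true}) {
    std::cout << (reversed ? "reversed: " : "forward:  ");
    for (int i = 0; i < 3; ++i) {             // time steps
      for (size_t j = 0; j < seqStart.size(); ++j) {
        if (i >= seqLength[j]) continue;      // sequence j already finished
        int id = reversed ? seqStart[j] + seqLength[j] - 1 - i
                          : seqStart[j] + i;
        std::cout << id << ' ';
      }
    }
    std::cout << '\n';  // forward:  0 3 1 4 2 / reversed: 2 4 1 3 0
  }
  return 0;
}
```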
-
-/* create scattered id information for all realLayer of inFrameLines one time.
- * If hasSubseq, will also create scattered sequenceStartPositions information
- * for all realLayer of inFrameLines one time.
- */
-void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
-                                                 const Argument& input,
-                                                 PassType passType) {
-  if (!input.hasSeq()) {
-    createInFrameInfo_nonseq(inlinkId, input, passType);
-  } else if (!input.hasSubseq()) {
-    createInFrameInfo_seq(inlinkId, input, passType);
-  } else {
-    createInFrameInfo_subseq(inlinkId, input, passType);
-  }
-}
-
-void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId,
-                                                        const Argument& input,
-                                                        PassType passType) {
-  std::vector<int> allIds;
-
-  auto& seqInfo = seqInfos_[inlinkId];
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.clear();
-  for (size_t i = 0; i < seqInfo.size(); ++i) {
-    allIds.push_back(seqInfo[i].seqId);
-  }
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-}
-
-void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId,
-                                                     const Argument& input,
-                                                     PassType passType) {
-  std::vector<int> allIds;
-  auto& seqInfo = seqInfos_[inlinkId];
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    for (int j = 0; j < numSeqs_[i]; ++j) {
-      int seqLength = seqInfo[j].topLevelLength;
-      int seqStart = seqInfo[j].seqStart;
-      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
-                                 : (seqStart + i));
-    }
-    inlinkInfo->idIndex.push_back(allIds.size());
-  }
-
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-  CHECK_EQ(inlinkInfo->idIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId,
-                                                        const Argument& input,
-                                                        PassType passType) {
-  std::vector<int> allIds;
-
-  auto& seqInfo = seqInfos_[inlinkId];
-
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
-  std::vector<int> sequenceStartPositions;
-  const int* subSequenceStartPositions = nullptr;
-
-  subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
-  inlinkInfo->seqStartPosIndex.clear();
-  inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    sequenceStartPositions.push_back(0);  // first element = 0
-    for (int j = 0; j < numSeqs_[i]; ++j) {
-      int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i];
-      int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1];
-      for (int k = subSeqStart; k < subSeqEnd; ++k) {
-        allIds.push_back(k);
-      }
-      sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                       subSeqEnd - subSeqStart);
-    }
-    inlinkInfo->idIndex.push_back(allIds.size());
-    inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size());
-  }
-  // inFrameLine create sequenceStartPositions one time
-  CHECK_EQ(
-      sequenceStartPositions.size(),
-      static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
-  CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-  createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
-
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-  CHECK_EQ(inlinkInfo->idIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-/* like createInFrameInfo, but for all realLayer of memoryFrameLines */
-void RecurrentGradientMachine::createMemoryFrameInfo(
-    MemoryFrameLine* memoryFrameLine, PassType passType) {
-  const Argument& input = (*memoryFrameLine).rootLayer->getOutput();
-  size_t numSequences = input.getNumSequences();
-  std::vector<int> allIds;
-  bool seqFlag = input.hasSeq();
-  CHECK(!input.hasSubseq())
-      << "Subsequence boot layer for memory is not supported";
-
-  if (seqFlag) {  // for sequenceScatterAgentLayer
-    std::vector<int> sequenceStartPositions;
-    sequenceStartPositions.push_back(0);  // first element = 0
-    const int* starts = input.sequenceStartPositions->getData(false);
-    for (size_t i = 0; i < numSequences; ++i) {
-      // memory info adopts the info of inlinks[0]
-      int seqId = seqInfos_[0][i].seqId;
-      for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) {
-        allIds.push_back(k);
-      }
-      sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                       starts[seqId + 1] - starts[seqId]);
-    }
-    createSeqPos(sequenceStartPositions,
-                 &(*memoryFrameLine).sequenceStartPositions);
-
-  } else {  // for scatterAgentLayer
-    for (size_t i = 0; i < numSequences; ++i) {
-      allIds.push_back(seqInfos_[0][i].seqId);
-    }
-  }
-  // copy and check scatterId
-  copyScattedId(allIds, &(*memoryFrameLine).allIds, input.getBatchSize());
-  // memoryFrameLine select rows in real layer one time
-  selectRowsOneTime((*memoryFrameLine).rootLayer,
-                    (*memoryFrameLine).allIds,
-                    &(*memoryFrameLine).outArg,
-                    passType);
-}
-
-void RecurrentGradientMachine::copyScattedId(std::vector<int>& srcIds,
-                                             IVectorPtr* dstIds,
-                                             int size) {
-  int idSize = srcIds.size();
-  CHECK_EQ(idSize, size);
-  IVector::resizeOrCreate(*dstIds, idSize, useGpu_);
-  (*dstIds)->copyFrom(srcIds.data(), idSize);
-  // check
-  std::sort(srcIds.begin(), srcIds.end());
-  for (int i = 0; i < idSize; ++i) {
-    CHECK_EQ(srcIds[i], i);
-  }
-}
-
-void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer,
-                                                 const IVectorPtr& allIds,
-                                                 Argument* arg,
-                                                 PassType passType) {
-  Argument& src = layer->getOutput();
-  if (src.value) {
-    const MatrixPtr& realV = src.value;
-    int height = realV->getHeight();
-    int width = realV->getWidth();
-    Matrix::resizeOrCreate(
-        arg->value, height, width, /* trans */ false, useGpu_);
-    arg->value->zeroMem();
-    arg->value->selectRows(*realV, *allIds);
-    if (passType != PASS_TEST) {
-      Matrix::resizeOrCreate(
-          arg->grad, height, width, /* trans */ false, useGpu_);
-      arg->grad->zeroMem();
-    }
-  }
-  if (src.ids) {
-    IVector::resizeOrCreate(arg->ids, src.ids->getSize(), useGpu_);
-    arg->ids->selectFrom(*src.ids, *allIds);
-  }
-}
-
-void RecurrentGradientMachine::createSeqPos(
-    const std::vector<int>& sequenceStartPosition,
-    ICpuGpuVectorPtr* sequenceStartPositions) {
-  int size = sequenceStartPosition.size();
-  const int* data = sequenceStartPosition.data();
-  ICpuGpuVector::resizeOrCreate(*sequenceStartPositions, size, false);
-  (*sequenceStartPositions)->copyFrom(data, size, false);
-}
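`copyScattedId` copies the gather ids to the device and then asserts they form a permutation of `[0, batchSize)`, i.e. that every input row is consumed exactly once by exactly one frame. A standalone restatement of that invariant check (the `isPermutation` name is ours):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// After all frames have emitted their gather ids, the concatenated ids must
// be a permutation of [0, size): sort them and compare against 0..size-1.
bool isPermutation(std::vector<int> ids, int size) {
  if (static_cast<int>(ids.size()) != size) return false;
  std::sort(ids.begin(), ids.end());
  for (int i = 0; i < size; ++i) {
    if (ids[i] != i) return false;
  }
  return true;
}

int main() {
  assert(isPermutation({0, 3, 1, 4, 2}, 5));   // the forward ids from above
  assert(!isPermutation({0, 3, 1, 3, 2}, 5));  // row 3 gathered twice
  return 0;
}
```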
" - "At least one of the Memory layer MUST have a layer that is NOT in " - "the layer group to boot it, and this boot layer is used to " - "decide batch_size in generation process."; - return numSequences; -} - -void RecurrentGradientMachine::generateSequence() { - CHECK_NOTNULL(eosFrameLine_.get()); - CHECK_GE(outFrameLines_.size(), 1UL); - size_t numSequences = getGenBatchSize(); - - resizeBootFrame(numSequences); - // We create only two sub-network in generation, one stores states of all - // layers in previous time step and the other storing the states at current - // time step. - resizeOrCreateFrames(2); - - // outFrameLines_.size() > 1UL - dataArgsSize_ = outFrameLines_.size() - 1; - dataArgs_.resize(dataArgsSize_); - dataArgsFrame_.clear(); - dataArgsFrame_.resize(dataArgsSize_); - - // connect boot frame memory links - std::vector ids(numSequences); - for (size_t i = 0; i < numSequences; ++i) { - ids[i] = i; - } - for (auto& memoryFrameLine : memoryFrameLines_) { - if (memoryFrameLine.rootAgent) { - auto scatterAgent = - dynamic_cast(memoryFrameLine.rootAgent.get()); - scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids); - } - NeuralNetwork::connect( - memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size()); - } - - // boot layer forward - AsyncGpuBlock asyncGpuBlock; - - for (auto& memoryFrameLine : memoryFrameLines_) { - memoryFrameLine.bootLayer->forward(PASS_TEST); - } - - // init outArg - size_t resultNum = generator_.config.num_results_per_sample(); - size_t maxGenWordCount = - generator_.config.max_num_frames() * numSequences * resultNum; - IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false); - if (resultNum > 1) { - CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); - Matrix::resizeOrCreate(generator_.outArg.in, - /* height */ numSequences, - /* width */ resultNum, - false, - /* useGpu */ false); - } - ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, - numSequences + 1, - /* useGpu */ false); - if (getBeamSize() > 1) { - beamSearch(numSequences); - } else { - oneWaySearch(numSequences); - } - if (dataArgsSize_) createDataOutlink(); - - size_t size = generator_.ids.size(); - generator_.outArg.ids->resize(size); - generator_.outArg.ids->copyFrom(generator_.ids.data(), size); - - OutFrameLine& outFrameLine = outFrameLines_[0]; - auto dataAgent = dynamic_cast(outFrameLine.agentLayer.get()); - CHECK_NOTNULL(dataAgent); - dataAgent->setData(generator_.outArg); - dataAgent->prefetch(); -} - -void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { - OutFrameLine& outFrameLine = outFrameLines_[0]; - - // finalPaths_[0] stores the generated results of the - // entire batch, so its size exactly equals to batchSize. 
-
-void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
-  OutFrameLine& outFrameLine = outFrameLines_[0];
-
-  // finalPaths_[0] stores the generated results of the
-  // entire batch, so its size exactly equals batchSize.
-  finalPaths_.clear();
-  finalPaths_.resize(1);
-  std::vector<Path>& finalPaths = finalPaths_[0];
-  finalPaths.resize(batchSize);
-
-  seqIds_.resize(batchSize);
-  std::vector<int> scatterIds;
-  for (size_t i = 0; i < batchSize; ++i) {
-    finalPaths[i].seqId = i;
-    seqIds_[i] = i;
-  }
-
-  // forward
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    if (i && scatterIds.empty()) break;
-    int machineCur = i % 2;
-    int machinePrev = (i - 1) % 2;
-    // connect memory links
-    if (i) {
-      seqIds_.clear();
-      for (size_t j = 0; j < batchSize; ++j) {
-        if (finalPaths[j].seqId != -1) seqIds_.push_back(j);
-      }
-
-      for (auto& memoryFrameLine : memoryFrameLines_) {
-        auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
-            memoryFrameLine.scatterAgents[machineCur].get());
-        scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                                   scatterIds);
-        scatterAgent->forward(PASS_TEST);
-        NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
-                               memoryFrameLine.scatterAgents[machineCur]);
-      }
-    }
-    const std::vector<Argument> inArgs;
-    std::vector<Argument*> outArgs;
-    frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST);
-
-    const IVectorPtr& idVec = outFrameLine.frames[machineCur]->getOutput().ids;
-    for (size_t j = 0; j < seqIds_.size(); ++j) {
-      finalPaths[seqIds_[j]].ids.push_back(idVec->getElement(j));
-      finalPaths[seqIds_[j]].machineIdVec.push_back(j);
-    }
-
-    copyDataOutlinkFrame(machineCur);
-
-    // check eos
-    const IVectorPtr& eosVec =
-        eosFrameLine_->layers[machineCur]->getOutput().ids;
-    scatterIds.clear();
-    for (size_t j = 0; j < seqIds_.size(); ++j) {
-      if (eosVec->getElement(j) == 1U) {
-        // path.seqId = -1 indicates the end of generation
-        // for an input sequence
-        finalPaths[seqIds_[j]].seqId = -1;
-      } else {
-        scatterIds.push_back(j);
-      }
-    }
-  }
-
-  batchMachineIdVec_.clear();
-  batchMachineStartPos_.clear();
-  int* starts =
-      generator_.outArg.sequenceStartPositions->getMutableData(false);
-  starts[0] = 0;
-  generator_.ids.clear();
-  for (size_t i = 0; i < batchSize; ++i) {
-    generator_.ids.insert(generator_.ids.end(),
-                          finalPaths[i].ids.begin(),
-                          finalPaths[i].ids.end());
-    starts[i + 1] = generator_.ids.size();
-    batchMachineIdVec_.insert(batchMachineIdVec_.end(),
-                              finalPaths[i].machineIdVec.begin(),
-                              finalPaths[i].machineIdVec.end());
-  }
-}
-
-void RecurrentGradientMachine::connectPrevFrame(int stepId,
-                                                std::vector<Path>& paths) {
-  int machineCur = stepId % 2;
-  int machinePrev = (stepId - 1) % 2;
-  int beam = getBeamSize();
-  machineIds_.clear();
-  topIds_.clear();
-  seqIds_.clear();
-
-  for (size_t j = 0; j < paths.size(); ++j) {
-    machineIds_.push_back(paths[j].machineId);
-    topIds_.push_back(paths[j].machineId * beam + paths[j].topIndex);
-    seqIds_.push_back(paths[j].seqId);
-  }
-
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    bool isOutIds = (memoryFrameLine.layerName == outFrameLines_[0].layerName);
-    auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
-        memoryFrameLine.scatterAgents[machineCur].get());
-    scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                               isOutIds ? topIds_ : machineIds_);
-    scatterAgent->forward(PASS_TEST);
-    NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
-                           memoryFrameLine.scatterAgents[machineCur]);
-  }
-}
-
-void RecurrentGradientMachine::forwardFrame(int machineCur) {
-  // forward
-  const std::vector<Argument> inArgs;
-  std::vector<Argument*> outArgs;
-  frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST);
-
-  copyDataOutlinkFrame(machineCur);
-
-  IVectorPtr& ids = outFrameLines_[0].frames[machineCur]->getOutput().ids;
-  MatrixPtr in = outFrameLines_[0].frames[machineCur]->getOutput().in;
-  IVectorPtr& eos = eosFrameLine_->layers[machineCur]->getOutput().ids;
-  if (useGpu_) {
-    IVector::resizeOrCreate(cpuId_, ids->getSize(), false /* useGpu */);
-    cpuId_->copyFrom(*ids);
-    Matrix::resizeOrCreate(cpuProb_,
-                           in->getHeight(),
-                           in->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    cpuProb_->copyFrom(*in);
-    IVector::resizeOrCreate(cpuEos_, eos->getSize(), false /* useGpu */);
-    cpuEos_->copyFrom(*eos);
-  } else {
-    cpuId_ = ids;
-    cpuProb_ = in;
-    cpuEos_ = eos;
-  }
-}
-
-void RecurrentGradientMachine::singlePathExpand(Path& curPath,
-                                                size_t curPathId,
-                                                std::vector<Path>& newPaths,
-                                                size_t expandWidth) {
-  int calc_id =
-      gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0;
-
-  const int* idVec = cpuId_->getData();
-  const real* probMat = cpuProb_->getData();
-  const int* eosVec = cpuEos_->getData();
-
-  for (size_t k = 0; k < expandWidth; k++) {
-    int index = curPathId * expandWidth + k;
-    int id = idVec[index];
-    real prob = probMat[index];
-    /*
-     * Ordinarily, beam search greedily expands the expandWidth most promising
-     * paths, which are always returned by MaxIdLayer.
-     * However, if the user customizes the beam search procedure by
-     * restricting the expansion to a user-defined subset, MaxIdLayer may not
-     * be able to return expandWidth valid expansions, and it uses -1 to
-     * indicate the end of the valid expansion candidates.
-     */
-    if (id == -1) break;
-
-    real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob;
-    Path newPath(
-        curPath, id, newLogProb, curPathId /*machineId*/, k /*topIndex*/);
-    if (this->beamSearchCtrlCallbacks_) {
-      if (beamSearchCtrlCallbacks_->stopDetermineCandidates(
-              newPath.seqId, newPath.ids, newPath.probHistory))
-        return;
-    }
-    // outFrameLines_.size() > 1UL
-    if (dataArgsSize_) {
-      newPath.machineIdVec = curPath.machineIdVec;
-      newPath.machineIdVec.push_back(curPathId);
-    }
-    bool atEos =
-        eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_;
-    // adjustNewPath
-    newPath.adjustProb(calc_id, atEos);
-    if (this->beamSearchCtrlCallbacks_) {
-      this->beamSearchCtrlCallbacks_->normOrDropNode(
-          newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb);
-    }
-    if (!newPath.isDropable()) {
-      atEos ? finalPaths_[curPath.seqId].push_back(newPath)
-            : newPaths.push_back(newPath);
-    }
-  }  // for expandWidth
-
-  if (gDiyProbStop) {
-    gDiyProbStop(calc_id);
-  }
-}
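The score bookkeeping in `singlePathExpand` is the standard one: a path's score is the sum of per-step log probabilities (`log(prob)` when the generator is configured with `log_prob()`), so extending a path can only lower its score. A trimmed-down sketch with hypothetical `Path`/`extend` names:

```cpp
#include <cmath>
#include <iostream>
#include <vector>

// Minimal Path: scores accumulate by adding the per-step log probability,
// mirroring the newLogProb computation above.
struct Path {
  std::vector<int> ids;
  double logProb = 0.0;
  Path extend(int id, double prob) const {
    Path next = *this;
    next.ids.push_back(id);
    next.logProb += std::log(prob);  // longer paths accumulate lower scores
    return next;
  }
};

int main() {
  Path p;
  Path q = p.extend(7, 0.5).extend(3, 0.25);
  std::cout << q.logProb << '\n';  // log(0.5) + log(0.25) ~= -2.079
  return 0;
}
```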
-
-void RecurrentGradientMachine::beamExpand(std::vector<Path>& paths,
-                                          std::vector<Path>& newPaths) {
-  size_t candidatePathCount = paths.size();
-  // idVec.size() can be larger than candidatePathCount * beam,
-  // so users may drop some nodes as they see fit.
-  CHECK_EQ(cpuId_->getSize() % candidatePathCount, 0UL);
-  size_t expandWidth = cpuId_->getSize() / candidatePathCount;
-
-  // iterate over each sequence
-  size_t totalExpandCount = 0;
-  int prevSeqId = -1;
-  int curSeqId = 0;
-  for (size_t j = 0; j <= candidatePathCount; j++) {
-    // expansions of a single sequence are all processed
-    curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1);
-    if (prevSeqId != -1 && curSeqId != prevSeqId) {
-      totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount);
-    }
-    if (j == candidatePathCount) return;
-    singlePathExpand(paths[j], j, newPaths, expandWidth);
-
-    prevSeqId = paths[j].seqId;
-  }  // for paths
-}
-
-// Drop extra nodes to beam size.
-size_t RecurrentGradientMachine::beamShrink(std::vector<Path>& newPaths,
-                                            size_t seqId,
-                                            size_t totalExpandCount) {
-  size_t minNewPathSize =
-      std::min(getBeamSize(), newPaths.size() - totalExpandCount);
-  if (!minNewPathSize) {
-    return 0;
-  }
-  std::nth_element(newPaths.begin() + totalExpandCount,
-                   newPaths.begin() + totalExpandCount + minNewPathSize,
-                   newPaths.end(),
-                   Path::greaterPath);
-  newPaths.resize(totalExpandCount + minNewPathSize);
-
-  real minPathLogProb =
-      std::min_element(newPaths.end() - minNewPathSize, newPaths.end())
-          ->logProb;
-  real maxPathLogProb =
-      std::max_element(newPaths.end() - minNewPathSize, newPaths.end())
-          ->logProb;
-
-  // Remove the already formed paths that are relatively short
-  finalPaths_[seqId].erase(
-      std::remove_if(finalPaths_[seqId].begin(),
-                     finalPaths_[seqId].end(),
-                     [&](Path& p) { return p.logProb < minPathLogProb; }),
-      finalPaths_[seqId].end());
-  for (auto p : finalPaths_[seqId]) {
-    if (minFinalPathLogProb_[seqId] > p.logProb) {
-      minFinalPathLogProb_[seqId] = p.logProb;
-    }
-  }
-
-  if (finalPaths_[seqId].size() >= getBeamSize() &&
-      minFinalPathLogProb_[seqId] >= maxPathLogProb) {
-    newPaths.resize(totalExpandCount);
-    return 0;
-  }
-  return minNewPathSize;
-}
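The key trick in `beamShrink` is `std::nth_element`: an O(n) partial ordering that puts the top-k candidates in front without fully sorting the vector. A self-contained demonstration on bare scores (the `beam` value and scores are made up):

```cpp
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<double> logProb = {-2.1, -0.3, -5.0, -1.2, -0.9, -3.3};
  const size_t beam = 3;
  // After this call, the first `beam` entries are the `beam` largest scores,
  // in unspecified order; everything after them is no larger.
  std::nth_element(logProb.begin(), logProb.begin() + beam, logProb.end(),
                   [](double a, double b) { return a > b; });
  logProb.resize(beam);  // drop everything below the beam
  for (double p : logProb) std::cout << p << ' ';  // -0.3 -0.9 -1.2, any order
  return 0;
}
```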
-
-void RecurrentGradientMachine::fillGenOutputs() {
-  size_t numResults = generator_.config.num_results_per_sample();
-  for (size_t i = 0; i < finalPaths_.size(); ++i) {
-    size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size());
-    std::partial_sort(finalPaths_[i].begin(),
-                      finalPaths_[i].begin() + minFinalPathsSize,
-                      finalPaths_[i].end(),
-                      Path::greaterPath);
-    finalPaths_[i].resize(minFinalPathsSize);
-  }
-
-  generator_.ids.clear();
-  int* starts =
-      generator_.outArg.sequenceStartPositions->getMutableData(false);
-  starts[0] = 0;
-  if (numResults > 1) {
-    int idsProbSaveSize = 0;
-    for (auto inSeq : finalPaths_) {
-      for (auto path : inSeq) idsProbSaveSize += path.ids.size();
-      idsProbSaveSize += inSeq.size();
-    }
-    Matrix::resizeOrCreate(
-        generator_.outArg.value, idsProbSaveSize, 1, false, false);
-    real* idsProb = generator_.outArg.value->getData();
-
-    real* probs = generator_.outArg.in->getData();
-    size_t curPos = 0;
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-        Path& path = finalPaths_[i][j];
-        size_t genLen = path.ids.size();
-        generator_.ids.push_back(genLen);  // sequence size
-        generator_.ids.insert(
-            generator_.ids.end(), path.ids.begin(), path.ids.end());
-        generator_.ids.push_back(-1);  // end of sequence
-
-        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
-        curPos += genLen;
-        idsProb[curPos++] = -1.0;
-        probs[i * numResults + j] = path.logProb;
-      }
-      starts[i + 1] = generator_.ids.size();
-    }
-  } else {
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      CHECK(!finalPaths_[i].empty());
-      Path& path = finalPaths_[i][0];
-      generator_.ids.insert(
-          generator_.ids.end(), path.ids.begin(), path.ids.end());
-      starts[i + 1] = starts[i] + path.ids.size();
-    }
-  }
-}
-
-void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
-  for (size_t i = 0; i < dataArgsSize_; i++) {
-    Argument outFrame;
-    outFrame.resizeAndCopyFrom(
-        outFrameLines_[i + 1].frames[machineCur]->getOutput(), useGpu_);
-    dataArgsFrame_[i].emplace_back(outFrame);
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
-    bool isSeq, std::vector<Argument>& outArgs) {
-  batchMachineIdVec_.clear();
-
-  size_t seqIdx = 0;
-  for (size_t i = 0; i < finalPaths_.size(); ++i) {
-    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
-      if (isSeq) {
-        for (size_t i = 0; i < machineIdVec.size(); ++i) {
-          size_t rowId = machineIdVec[i];
-          int* seqPos =
-              outArgs[i].sequenceStartPositions->getMutableData(false);
-          batchMachineIdVec_.push_back(seqPos[rowId]);
-        }
-      } else {
-        batchMachineIdVec_.insert(
-            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
-      }
-      seqIdx++;
-    }
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
-    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
-  size_t totalSeqNum = std::accumulate(
-      finalPaths_.begin(),
-      finalPaths_.end(),
-      0UL,
-      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
-  copySize.resize(totalSeqNum, 1);
-
-  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
-  if (isSeq) {
-    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
-    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
-             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
-    int* starts = inputSeqStartPos->getMutableData(false);
-    int seqId = 0;
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-        copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
-                                            : starts[j + 1] - starts[j];
-        batchMachineStartPos_[seqId + 1] =
-            batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
-        seqId++;
-      }
-    }
-  } else {
-    for (size_t i = 0; i < finalPaths_[0].size(); ++i)
-      batchMachineStartPos_[i + 1] =
-          batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlink() {
-  for (size_t i = 0; i < dataArgsSize_; i++) {
-    bool isSeq = dataArgsFrame_[i][0].hasSeq();
-    std::vector<int> copySize;
-    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
-    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
-
-    dataArgs_[i].concat(dataArgsFrame_[i],
-                        batchMachineIdVec_,
-                        batchMachineStartPos_,
-                        copySize,
-                        useGpu_,
-                        HPPL_STREAM_1,
-                        PASS_TEST);
-    auto dataAgent =
-        dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
-    CHECK_NOTNULL(dataAgent);
-    dataAgent->setData(dataArgs_[i]);
-  }
-}
-
-void RecurrentGradientMachine::beamSearch(size_t batchSize) {
-  finalPaths_.clear();
-  finalPaths_.resize(batchSize);
-  seqIds_.resize(batchSize);
-  minFinalPathLogProb_.clear();
-  minFinalPathLogProb_.resize(batchSize, 0);
-
-  std::vector<Path> paths;
-  std::vector<Path> newPaths;
-  for (size_t i = 0; i < batchSize; ++i) {
-    paths.push_back(Path(i));
-    if (this->beamSearchCtrlCallbacks_) {
-      paths.back().recordHistory();
-    }
-  }
-
-  // restart beam search
-  stopBeamSearch_ = false;
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    int machineCur = i % 2;
-    std::unique_ptr<ScopedCallbacks> statisticsBlock;
-    if (this->beamSearchStatistics_) {
-      auto ptr =
-          new ScopedCallbacks(beamSearchStatistics_->onEachStepStarted,
-                              beamSearchStatistics_->onEachStepStoped,
-                              i);
-      statisticsBlock.reset(ptr);
-    }
-    if (stopBeamSearch_) break;
-
-    if (i) connectPrevFrame(i, paths);
-
-    if (this->beamSearchCtrlCallbacks_) {
-      std::vector<std::vector<int>*> prefixes;
-      prefixes.resize(paths.size());
-      std::transform(
-          paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) {
-            return const_cast<std::vector<int>*>(&p.ids);
-          });
-      beamSearchCtrlCallbacks_->beamSearchCandidateAdjust(
-          prefixes, frames_[machineCur].get(), i);
-    }
-
-    forwardFrame(machineCur);
-    beamExpand(paths, newPaths);
-    if (newPaths.empty()) break;
-
-    paths = newPaths;
-    newPaths.clear();
-  }  // end for machineCur
-  fillGenOutputs();
-}
-
-void RecurrentGradientMachine::Path::adjustProb(int calc_id, bool atEos) {
-  if (gDiyProbMethod) {
-    logProb = gDiyProbMethod(calc_id, ids.size(), ids.data(), logProb, atEos);
-  }
-}
-
-}  // namespace paddle
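The `statisticsBlock` in `beamSearch()` above is an RAII guard: it fires one callback when the step begins and another when the step ends, even on early exits. The exact shape of Paddle's `ScopedCallbacks` is not visible in this diff, so the following is only a guess at the idea, with illustrative names:

```cpp
#include <functional>
#include <iostream>

// RAII wrapper: invoke onStart at construction and onStop at destruction,
// bracketing each beam-search step. Purely illustrative.
class ScopedStepCallbacks {
 public:
  using Callback = std::function<void(int)>;
  ScopedStepCallbacks(Callback onStart, Callback onStop, int step)
      : onStop_(std::move(onStop)), step_(step) {
    onStart(step_);
  }
  ~ScopedStepCallbacks() { onStop_(step_); }

 private:
  Callback onStop_;
  int step_;
};

int main() {
  for (int i = 0; i < 2; ++i) {
    ScopedStepCallbacks guard(
        [](int s) { std::cout << "start " << s << '\n'; },
        [](int s) { std::cout << "stop " << s << '\n'; }, i);
    // ... expand and shrink the beam for step i ...
  }
  return 0;
}
```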
diff --git a/paddle/gserver/layers/AddtoLayer.h b/paddle/gserver/layers/AddtoLayer.h
deleted file mode 100644
index 6ea54f4a53d466594055db2fb5167fa1a9d6c9da..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/AddtoLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * This layer simply adds all of its input layers together, then activates
- * the sum. Each input of this layer should be the same size, which is also
- * the output size of this layer.
- * \f[
- *   y = f(\sum_{i} x_i + b)
- * \f]
- * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is
- * the activation function.
- *
- * The config file api is addto_layer.
- */
-class AddtoLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit AddtoLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~AddtoLayer() {}
-
-  /**
-   * Initialization of AddtoLayer.
-   */
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * Forward propagation.
-   * @note There is no weight matrix for each input, because it is just a
-   * simple add operation.
-   */
-  void forward(PassType passType) override;
-
-  /**
-   * Backward propagation.
-   */
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
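What `AddtoLayer` computes, stripped of the Matrix/GPU plumbing, is the formula in its doc comment: an element-wise sum of equally sized inputs plus bias, pushed through an activation. A scalar sketch (ReLU stands in for the configurable activation; all names here are ours):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// y = f(sum_i x_i + b), with f = ReLU for this sketch.
std::vector<float> addtoForward(const std::vector<std::vector<float>>& inputs,
                                const std::vector<float>& bias) {
  std::vector<float> out(bias);  // start from b
  for (const auto& x : inputs) {
    assert(x.size() == out.size());  // every input shares the layer size
    for (size_t i = 0; i < out.size(); ++i) out[i] += x[i];
  }
  for (auto& v : out) v = std::max(v, 0.0f);  // activate the sum
  return out;
}

int main() {
  auto y = addtoForward({{1, -2}, {0.5f, -1}}, {0, 0.5f});
  assert(y[0] == 1.5f && y[1] == 0.0f);  // 1 + 0.5 + 0; -2 - 1 + 0.5 -> 0
  return 0;
}
```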
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
deleted file mode 100644
index 51f346d5c9fdf9599cddf4b668c128035fd94187..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/AgentLayer.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * AgentLayer is used as a virtual input to another layer in the config.
- * Before executing forward/backward, setRealLayer() should be
- * called to set one and only one real layer.
- */
-class AgentLayer : public Layer {
- protected:
-  LayerPtr realLayer_;
-  int numSamples_;
-
- public:
-  explicit AgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~AgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  // if *numSamples* is set,
-  // the real layer output will only use the first *numSamples* rows
-  void setRealLayer(LayerPtr layer, int numSamples = 0) {
-    realLayer_ = layer;
-    numSamples_ = numSamples;
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
-
-/**
- * Like AgentLayer, but it can gather many real layers. Each real layer
- * gives a few rows of a sequence; after gathering all real layers,
- * GatherAgentLayer collects a complete sequence.
- */
-class GatherAgentLayer : public Layer {
- protected:
-  std::vector<LayerPtr> realLayers_;
-  std::vector<IVectorPtr> idsVec_;
-  // we don't clear the idsVec_ vector, to avoid IVector alloc/free
-  IVectorPtr allIds_;
-  std::vector<int> idIndex_;
-
- public:
-  explicit GatherAgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~GatherAgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  // call before addRealLayer
-  void clearRealLayers() { realLayers_.clear(); }
-
-  void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions,
-                             ICpuGpuVectorPtr subSequenceStartPositions,
-                             const IVectorPtr& allIds,
-                             const std::vector<int>& idIndex);
-
-  // add one real layer; can be called many times
-  void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-  void forwardValue(PassType passType);
-  void forwardIds(PassType passType);
-};
-
-/**
- * Like AgentLayer, but it only selects a few rows in the real layer.
- * [idIndex, idIndex + idSize) of *ids* in setRealLayerAndOutput()
- * are the selected row ids. It's used to scatter one layer's output
- * to many small submodels. ScatterAgentLayer can also support an ids real
- * layer; if so, the agent will select a few ids in the real layer.
- */
-class ScatterAgentLayer : public Layer {
- protected:
-  LayerPtr realLayer_;
-  IVectorPtr ids_;
-  IVectorPtr cpuIds_;
-  Argument realOutArg_;
-  int idIndex_;
-  int idSize_;
-  int seqStartPosIndex_;
-  int numSequences_;  // number of sequences in this scatterAgentLayer
-  bool handleBackward_;
-
-  // used to store expanded cpuStartPositions or subSequenceStartPositions
-  // of the real layer.
-  ICpuGpuVectorPtr inputStartPos_;
-
-  // true for setRealLayer, false for setRealLayerAndOutput
-  bool selectionMode_;
-
- public:
-  explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~ScatterAgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * @brief set real layer in generation
-   *
-   * @param layer[input] realLayer
-   * @param ids[input] row id in real layer
-   * @param copyId[input] whether to copy a cpu version of ids,
-   *        false (default) in ScatterAgentLayer, and
-   *        true in SequenceScatterAgentLayer.
-   */
-  void setRealLayer(LayerPtr layer, const std::vector<int>& ids) {
-    realLayer_ = layer;
-    IVector::resizeOrCreate(ids_, ids.size(), useGpu_);
-    ids_->copyFrom(ids.data(), ids.size());
-    if (useGpu_) {
-      IVector::resizeOrCreate(cpuIds_, ids.size(), false);
-      cpuIds_->copyFrom(ids.data(), ids.size());
-    } else {
-      cpuIds_ = ids_;
-    }
-    selectionMode_ = true;
-  }
-
-  // set real layer and output, [idIndex, idIndex + idSize) of *ids*
-  // are the selected rows for realOutArg in realLayer
-  void setRealLayerAndOutput(LayerPtr layer,
-                             const Argument& outArg,
-                             const IVectorPtr& ids,
-                             int idIndex,
-                             int idSize,
-                             bool handleBackward) {
-    realLayer_ = layer;
-    realOutArg_ = outArg;
-    ids_ = ids;
-    idIndex_ = idIndex;
-    idSize_ = idSize;
-    handleBackward_ = handleBackward;
-    selectionMode_ = false;
-  }
-
-  void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
-                                 int seqStartPosIndex,
-                                 int numSequences) {
-    realOutArg_.sequenceStartPositions = sequenceStartPositions;
-    seqStartPosIndex_ = seqStartPosIndex;
-    numSequences_ = numSequences;
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  void forwardWithSelection(PassType passType);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
deleted file mode 100644
index 03e2673b55ceca7a698f1b858327ad6fad739087..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/AverageLayer.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SequencePoolLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer for "internal average" for sequence input.
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *   Output: output size is the number of input sequences (NOT input
- *   instances)
- *   output[i] = average_{for each instance in this sequence}{input[i]}
- * If stride_ > 0:
- *   Output: a shortened sequence. Stride is the step size by which we slide
- *   a window upon the input sequence, and the average pooling operation is
- *   then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *   Check that the input sequence has sub-sequences.
- *   Output: output size is the number of input sub-sequences
- *   output[i] = average_{for each instance in this sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-class AverageLayer : public SequencePoolLayer {
- public:
-  enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
-  explicit AverageLayer(const LayerConfig& config)
-      : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  int mode_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/BilinearInterpLayer.h b/paddle/gserver/layers/BilinearInterpLayer.h
deleted file mode 100644
index 8e08c2e1ce80172f55c93d8242821f683fa1a731..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/BilinearInterpLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for bilinear interpolation which is
- *        used on the conv layer output.
- *
- * @note  The config file api is bilinear_interp_layer.
- */
-class BilinearInterpLayer : public Layer {
- protected:
-  size_t outImgH_, outImgW_;
-  size_t inImgH_, inImgW_;
-  real ratioH_, ratioW_;
-  size_t numChannels_;
-
- public:
-  explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~BilinearInterpLayer() {}
-
-  size_t getSize();
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
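The three pooling strategies named in `AverageLayer` above differ only in the denominator applied to the span sum. A scalar sketch over one sequence span (the real layer works on matrix rows; `seqPool` is our name):

```cpp
#include <cassert>
#include <cmath>
#include <vector>

enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };

// Pool the span [begin, end) of a 1-D input under the given strategy.
float seqPool(const std::vector<float>& x, size_t begin, size_t end,
              AverageStrategy mode) {
  float sum = 0;
  for (size_t i = begin; i < end; ++i) sum += x[i];
  const float n = static_cast<float>(end - begin);
  switch (mode) {
    case kSum: return sum;                          // plain sum
    case kAverage: return sum / n;                  // mean
    case kAverageSquareRootN: return sum / std::sqrt(n);  // scaled sum
  }
  return 0;
}

int main() {
  std::vector<float> x = {1, 2, 3, 6};
  assert(seqPool(x, 0, 4, kSum) == 12.0f);
  assert(seqPool(x, 0, 4, kAverage) == 3.0f);
  assert(seqPool(x, 0, 4, kAverageSquareRootN) == 6.0f);  // 12 / sqrt(4)
  return 0;
}
```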
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
deleted file mode 100644
index 9d76584f3a4eda19a9e8f806256a7b8da617cc37..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Expand feature map to minibatch matrix.
- *        - matrix width is: blockH_ * blockW_ * channels_
- *        - matrix height is: outputH_ * outputW_
- *
- * \f[
- *    outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) /
- *        strideH\_ \\
- *    outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) /
- *        strideW\_
- * \f]
- *
- * The expansion method is the same as that of ExpandConvLayer, but the
- * transposed value is saved. After expanding, output_.sequenceStartPositions
- * will store the timeline. The number of time steps is outputH_ * outputW_
- * and the dimension of each time step is blockH_ * blockW_ * channels_. This
- * layer can be used after a convolutional neural network, and before a
- * recurrent neural network.
- *
- * The config file api is block_expand_layer.
- */
-class BlockExpandLayer : public Layer {
- protected:
-  /**
-   * @brief Calculate outputH_ and outputW_ and return the block number,
-   * which actually is the number of time steps.
-   * @return time steps, outputH_ * outputW_.
-   */
-  size_t getBlockNum();
-  size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
-  size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
-
-  TensorShape inputShape_;
-  TensorShape outputShape_;
-
- public:
-  explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~BlockExpandLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
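The `outputH_`/`outputW_` formula in BlockExpandLayer's comment is worth checking with numbers; note the integer division makes it a ceiling over the valid block placements:

```cpp
#include <cassert>

// 1 + (2 * padding + imgSize - block + stride - 1) / stride, as in the
// BlockExpandLayer doc comment above (one spatial axis).
int blockExpandOutSize(int imgSize, int block, int stride, int padding) {
  return 1 + (2 * padding + imgSize - block + stride - 1) / stride;
}

int main() {
  // A 7-wide map, 3-wide blocks, stride 2, no padding: starts 0, 2, 4 -> 3
  // placements, so 3 x 3 = 9 time steps of size block * block * channels.
  assert(blockExpandOutSize(7, 3, 2, 0) == 3);
  // With padding 1 the block can also start inside the padded border.
  assert(blockExpandOutSize(7, 3, 2, 1) == 4);
  return 0;
}
```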
diff --git a/paddle/gserver/layers/Conv3DLayer.h b/paddle/gserver/layers/Conv3DLayer.h
deleted file mode 100644
index 07b804bad02beb6ec9c3e9fd43c3cd3aa6d50b22..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Conv3DLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of the convolution layer.
- *        This layer expands the input and uses matrix multiplication to
- *        calculate the convolution operation.
- */
-class Conv3DLayer : public ConvBaseLayer {
- public:
-  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
-  ~Conv3DLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void addBias();
-  void backward(const UpdateCallback& callback);
-  void bpropBiases();
-  void bpropData(int i);
-  void bpropWeights(int i);
-  size_t getSize();
-
- protected:
-  // Figure out the dimensions for individual gemms.
-  IntV M_;  /// numFilters_ / filter_group_;
-  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
-  IntV K_;  /// outputD_ * outputH_ * outputW_
-  MatrixPtr colBuf_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp
deleted file mode 100644
index 56bf4f9fcb187f73409076b826b738f62d19516a..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvBaseLayer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/utils/Logging.h"
-namespace paddle {
-
-bool ConvBaseLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv")
-                  ? false
-                  : true;
-
-  /* Initialize the convolutional layer parameters */
-  numFilters_ = config_.num_filters();
-  sharedBiases_ = config_.shared_biases();
-  for (auto& inputConfig : config_.inputs()) {
-    const ConvConfig& conf = inputConfig.conv_conf();
-    padding_.push_back(conf.padding());
-    stride_.push_back(conf.stride());
-    dilation_.push_back(conf.dilation());
-    filterSize_.push_back(conf.filter_size());
-    paddingY_.push_back(conf.padding_y());
-    strideY_.push_back(conf.stride_y());
-    dilationY_.push_back(conf.dilation_y());
-    filterSizeY_.push_back(conf.filter_size_y());
-    channels_.push_back(conf.channels());
-    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
-                                              : conf.img_size());
-    imgSizeW_.push_back(conf.img_size());
-    groups_.push_back(conf.groups());
-    filterChannels_.push_back(conf.filter_channels());
-    outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
-    outputW_.push_back(conf.output_x());
-
-    paddingZ_.push_back(conf.padding_z());
-    strideZ_.push_back(conf.stride_z());
-    filterSizeZ_.push_back(conf.filter_size_z());
-    imgSizeD_.push_back(conf.img_size_z());
-    outputD_.push_back(conf.output_z());
-    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
-                            filterSizeZ_.back());
-  }
-
-  CHECK(inputLayers_.size() == parameters_.size());
-
-  // create new weights_ in derived class
-  // create new biases_ in derived class
-
-  // default caffe model
-  caffeMode_ = true;
-
-  return true;
-}
-
-size_t ConvBaseLayer::calOutputSize() {
-  auto clearAndReserve = [this](IntV* vec) {
-    vec->clear();
-    vec->reserve(this->inputLayers_.size());
-  };
-  clearAndReserve(&imgSizeH_);
-  clearAndReserve(&imgSizeW_);
-  clearAndReserve(&outputH_);
-  clearAndReserve(&outputW_);
-  size_t layerSize = 0;
-
-  auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) {
-    size_t filterSizeY;
-    size_t filterSize;
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1;
-      filterSize = (filterSize_[i] - 1) * dilation_[i] + 1;
-      inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
-      inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
-      const ConvConfig& conf = config_.inputs(i).conv_conf();
-      if (isDeconv_) {
-        if (inH[i] == 0)
-          inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
-        if (inW[i] == 0) inW[i] = conf.output_x();
-        outH.push_back(imageSize(
-            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(
-            imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
-      } else {
-        if (inH[i] == 0)
-          inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-        if (inW[i] == 0) inW[i] = conf.img_size();
-        outH.push_back(outputSize(
-            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(outputSize(
-            inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
-      }
-      CHECK_EQ(outH[i], outH[0]);
-      CHECK_EQ(outW[i], outW[0]);
-    }
-    getOutput().setFrameHeight(outH[0]);
-    getOutput().setFrameWidth(outW[0]);
-    layerSize = outH[0] * outW[0] * size_t(numFilters_);
-  };
-
-  setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
-
-  return layerSize;
-}
-
-}  // namespace paddle
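`calOutputSize` above first inflates the kernel to its dilated effective size and then maps sizes forward (`outputSize`, convolution) or backward (`imageSize`, deconvolution). A sketch of that bookkeeping under the usual caffe-style floor convention, which is what `caffeMode_ = true` is assumed to select; the function bodies here are illustrative, not Paddle's MathUtils:

```cpp
#include <cassert>

// Dilation inflates a k-tap kernel to its effective receptive size.
int effectiveFilter(int filterSize, int dilation) {
  return (filterSize - 1) * dilation + 1;
}

// Convolution: input size -> output size (caffe-style floor division).
int outputSize(int imageSize, int filterSize, int padding, int stride) {
  return (imageSize - filterSize + 2 * padding) / stride + 1;
}

// Deconvolution inverts the mapping, as imageSize() is used for above.
int imageSize(int outputSize, int filterSize, int padding, int stride) {
  return (outputSize - 1) * stride + filterSize - 2 * padding;
}

int main() {
  int k = effectiveFilter(3, 2);          // dilated 3-tap -> effective 5
  assert(k == 5);
  assert(outputSize(32, k, 2, 1) == 32);  // "same" padding for k = 5
  assert(imageSize(32, k, 2, 1) == 32);   // deconv undoes the conv
  return 0;
}
```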
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
deleted file mode 100644
index 801bc4f888c5a60e803c882dcf807678c64af20c..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/MathUtils.h"
-namespace paddle {
-
-/**
- * @brief A base convolution layer, which convolves the input image
- *        with learned filters and (optionally) adds biases.
- */
-
-class ConvBaseLayer : public Layer {
- protected:
-  typedef std::vector<int> IntV;
-
-  /// True if it's a deconv layer, false if it's a convolution layer
-  bool isDeconv_;
-
-  /// The number of filters.
-  int numFilters_;
-  /// The x dimension of the padding.
-  IntV padding_;
-  /// The y dimension of the padding.
-  IntV paddingY_;
-  /// The x dimension of the stride.
-  IntV stride_;
-  /// The y dimension of the stride.
-  IntV strideY_;
-  /// The x dimension of the dilation.
-  IntV dilation_;
-  /// The y dimension of the dilation.
-  IntV dilationY_;
-  /// The x dimension of a filter kernel.
-  IntV filterSize_;
-  /// The y dimension of a filter kernel.
-  IntV filterSizeY_;
-  /// The number of channels of the convolution input.
-  IntV channels_;
-  /// The spatial dimensions of input feature map height.
-  IntV imgSizeH_;
-  /// The spatial dimensions of input feature map width.
-  IntV imgSizeW_;
-  /// filterPixels_ = filterSizeX_ * filterSizeY_.
-  IntV filterPixels_;
-  /// filterChannels_ = channels_ / groups_.
-  IntV filterChannels_;
-  /// The spatial dimensions of output feature map height.
-  IntV outputH_;
-  /// The spatial dimensions of output feature map width.
-  IntV outputW_;
-
-  IntV outputD_;
-  IntV imgSizeD_;
-  IntV filterSizeZ_;
-  IntV strideZ_;
-  IntV paddingZ_;
-
-  /// Group size: refer to grouped convolution in
-  /// Alex Krizhevsky's paper: when group = 2, the first half of the
-  /// filters are only connected to the first half of the input channels,
-  /// and the second half are only connected to the second half.
-  IntV groups_;
-  /// Whether the bias is shared across the features in each channel.
-  bool sharedBiases_;
-
-  /// shape of weight: (numChannels * filterPixels_, numFilters)
-  WeightList weights_;
-  /// If shared_biases is false, the shape of bias is (numFilters_, 1).
-  /// If shared_biases is true, the shape of bias is
-  /// (numFilters_ * outputX * outputY, 1).
-  std::unique_ptr<Weight> biases_;
-
-  /// True by default. The only difference is the calculation
-  /// of output size.
-  bool caffeMode_;
-
- public:
-  explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * imgSizeH_ and imgSizeW_ will be set according to the previous input
-   * layers in this function. Then it will calculate outputH_ and outputW_
-   * and set them into the output argument.
-   */
-  virtual size_t calOutputSize();
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-};
-
-}  // namespace paddle
*/ - -#include "ConvBaseOperator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvBaseOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The height of data for two inputs are the same. Each data of the first input - * is convolved with each data of the second input indepedently. - * - * The config file api is conv_operator. - */ - -ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu) - : Operator(config, useGpu) { - CHECK(useGpu); - CHECK_EQ(config_.input_indices_size(), 2L); - - caffeMode_ = true; - getConvParams(); - computeConvSizes(); - - // initialize all to default algorithms - fwdAlgo_ = 0; - bwdFilterAlgo_ = 0; - bwdDataAlgo_ = 0; - fwdLimitBytes_ = 0; - bwdDataLimitBytes_ = 0; - bwdFilterLimitBytes_ = 0; - workSpaceInBytes_ = 0; - workSpace_ = nullptr; - - isSelectAlgo_ = false; -} - -void ConvBaseOperator::allocConvWorkSpace() { - hl_conv_workspace(imageDesc_, - outputDesc_, - filterDesc_, - convDesc_, - &fwdAlgo_, - &fwdLimitBytes_, - &bwdDataAlgo_, - &bwdDataLimitBytes_, - &bwdFilterAlgo_, - &bwdFilterLimitBytes_, - /*useDilation*/ false); - - size_t maxWorkSpace = 0; - maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); - - if (maxWorkSpace > workSpaceInBytes_) { - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpace_); - } - // total amount of storage needed - workSpace_ = hl_malloc_device(maxWorkSpace); - workSpaceInBytes_ = maxWorkSpace; - } -} - -void ConvBaseOperator::computeConvSizes() { - hl_create_filter_descriptor( - &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_); - hl_create_tensor_descriptor(&imageDesc_); - hl_create_tensor_descriptor(&outputDesc_); - hl_create_convolution_descriptor(&convDesc_, - imageDesc_, - filterDesc_, - paddingY_, - padding_, - strideY_, - stride_); -} - -void ConvBaseOperator::reshapeImageDescriptors() { - hl_tensor_reshape(imageDesc_, - 1, - channels_, - imageH_, - imageW_, - channels_ * imageH_ * imageW_, - imageH_ * imageW_, - imageW_, - 1); - hl_tensor_reshape(outputDesc_, - 1, - numFilters_, - outputH_, - outputW_, - numFilters_ * outputH_ * outputW_, - outputH_ * outputW_, - outputW_, - 1); - hl_reset_convolution_descriptor(convDesc_, - imageDesc_, - filterDesc_, - paddingY_, - padding_, - strideY_, - stride_); -} - -void ConvBaseOperator::getConvParams() { - configNumFilters_ = config_.num_filters(); - const ConvConfig &conf = config_.conv_conf(); - padding_ = conf.padding(); - stride_ = conf.stride(); - filterSize_ = conf.filter_size(); - paddingY_ = conf.padding_y(); - strideY_ = conf.stride_y(); - filterSizeY_ = conf.filter_size_y(); - filterPixels_ = filterSize_ * filterSizeY_; - configChannels_ = conf.channels(); - imgSize_ = conf.img_size(); - imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - imgPixels_ = imgSize_ * imgSizeY_; - CHECK_EQ(conf.groups(), 1U); - filterChannels_ = conf.filter_channels(); - outputX_ = conf.output_x(); - outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); - outputs_ = outputX_ * outputX_; - - isDeconv_ = (config_.type() == "conv") ? 
-
-void ConvBaseOperator::getConvParams() {
-  configNumFilters_ = config_.num_filters();
-  const ConvConfig &conf = config_.conv_conf();
-  padding_ = conf.padding();
-  stride_ = conf.stride();
-  filterSize_ = conf.filter_size();
-  paddingY_ = conf.padding_y();
-  strideY_ = conf.stride_y();
-  filterSizeY_ = conf.filter_size_y();
-  filterPixels_ = filterSize_ * filterSizeY_;
-  configChannels_ = conf.channels();
-  imgSize_ = conf.img_size();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  imgPixels_ = imgSize_ * imgSizeY_;
-  CHECK_EQ(conf.groups(), 1U);
-  filterChannels_ = conf.filter_channels();
-  outputX_ = conf.output_x();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  outputs_ = outputX_ * outputX_;
-
-  isDeconv_ = (config_.type() == "conv") ? false : true;
-  if (isDeconv_) {
-    channels_ = configNumFilters_;
-    numFilters_ = configChannels_;
-  } else {
-    channels_ = configChannels_;
-    numFilters_ = configNumFilters_;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseOperator.h b/paddle/gserver/layers/ConvBaseOperator.h
deleted file mode 100644
index c3c647cb69da5a70eb5346737cc0092e2201c89e..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvBaseOperator.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "Operator.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution
- * kernel. The height of data for the two inputs is the same. Each datum of
- * the first input is convolved with each datum of the second input
- * independently.
- *
- * The config file api is conv_operator.
- */
-
-class ConvBaseOperator : public Operator {
- public:
-  ConvBaseOperator(const OperatorConfig &config, bool useGpu);
-  /**
-   * Free the workspace on the device and destroy the cudnn descriptors.
-   */
-  virtual ~ConvBaseOperator() {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-      workSpaceInBytes_ = 0;
-    }
-
-    hl_destroy_tensor_descriptor(imageDesc_);
-    hl_destroy_tensor_descriptor(outputDesc_);
-    hl_destroy_filter_descriptor(filterDesc_);
-    hl_destroy_convolution_descriptor(convDesc_);
-  }
-
- protected:
-  /**
-   * Get convolution parameters from the layer config and
-   * initialize member variables.
-   */
-  void getConvParams();
-
-  /**
-   * Allocate GPU memory for the cudnn convolution algorithms.
-   */
-  void allocConvWorkSpace();
-
-  /**
-   * Create cudnn tensor descriptors for the convolution operation.
-   */
-  void computeConvSizes();
-
-  /**
-   * Reshape the cudnn tensor descriptors.
-   */
-  void reshapeImageDescriptors();
-
-  /**
-   * Reshape the cudnn tensor descriptors.
-   */
-  virtual void reshape(int batchSize) = 0;
-
-  /**
-   * Check that the filter size equals the size calculated from the
-   * parameters of the layer config.
-   */
-  void checkFilterSize(const MatrixPtr &filter) {
-    CHECK_EQ(static_cast<int>(filter->getWidth()),
-             filterSize_ * filterSizeY_ * channels_ * numFilters_);
-  }
-
-  /// Most of the member variables are the same as those of CudnnConvLayer.
-  /// There is no explanation here.
- bool isDeconv_; - int imageH_, imageW_, outputH_, outputW_; - hl_tensor_descriptor imageDesc_; - hl_tensor_descriptor outputDesc_; - hl_filter_descriptor filterDesc_; - hl_convolution_descriptor convDesc_; - bool caffeMode_; - int inputOffset_, outputOffset_, weightOffset_; - int numFilters_, channels_; - - /// from parsing config - int configNumFilters_, configChannels_; - int padding_, stride_, filterSize_, imgSize_, imgSizeY_; - int paddingY_, strideY_, filterSizeY_; - int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_; - - /// The following member variables are the same as in CudnnConvLayer. - int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_; - size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_; - size_t workSpaceInBytes_; - void *workSpace_; - bool isSelectAlgo_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h deleted file mode 100644 index f3266ae1ab945042cde9f24b7c2673c18d37bc11..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvBaseProjection.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Projection.h" -#include "paddle/math/MathUtils.h" - -namespace paddle { - -/** - * @brief Base class for ConvProjection and ConvTransProjection. - */ -class ConvBaseProjection : public Projection { - public: - /** - * Constructor. - */ - ConvBaseProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu); - - ~ConvBaseProjection(); - - protected: - void getConvParams(); - void initCudnn(); - - void reshapeTensorDesc(int batchSize); - void reshape(int batchSize); - - virtual size_t calOutputSize() = 0; - virtual size_t calInputSize() = 0; - - static void* getSpaceBytes(size_t size); - - /// True if this is a deconv (ConvTransProjection) layer; false if it is a - /// ConvProjection layer. - bool isDeconv_; - /// imageH_ and imageW_ / outputH_ and outputW_ - /// are calculated from the input layer. - int imageH_, imageW_; - int outputH_, outputW_; - /// configImgH_ and configImgW_ / configOutH_ and configOutW_ - /// are obtained from the config. - int configImgH_, configImgW_; - int configOutH_, configOutW_; - /// channels_ and numFilters_ are defined in terms of convolution semantics. - int channels_, numFilters_; - /// configChannels_ and configNumFilters_ are obtained from the config. - /// For Conv they are the same as channels_ and numFilters_; - /// for ConvTrans they are swapped relative to channels_ and numFilters_. - int configChannels_, configNumFilters_; - int paddingH_, paddingW_; - int strideH_, strideW_; - int dilationH_, dilationW_; - int filterH_, filterW_; - /// One group offset of input data. - int inputOffset_; - /// One group offset of output data. - int outputOffset_; - /// One group offset of weight. - int weightOffset_; - int groups_; - - /// Cudnn tensor descriptor for input.
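- /// (This descriptor and the three below it are presumably created in - /// initCudnn() and resized per batch in reshapeTensorDesc(), mirroring - /// computeConvSizes()/reshapeImageDescriptors() in ConvBaseOperator.)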
- hl_tensor_descriptor imageDesc_; - /// Cudnn tensor descriptor for output. - hl_tensor_descriptor outputDesc_; - /// Cudnn tensor descriptor for filter. - hl_filter_descriptor filterDesc_; - /// Cudnn tensor descriptor for a convolution operation. - hl_convolution_descriptor convDesc_; - - /// Record the algorithm for forward convolution, which is obtained by the - /// cudnn api that searches for the best-suited algorithm. - int fwdAlgo_; - /// Record the algorithm for computing the convolution gradient with respect - /// to the filter coefficients. - int bwdFilterAlgo_; - /// Record the algorithm for computing the convolution gradient with respect - /// to the input data. - int bwdDataAlgo_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// forward convolution with the specified algo. - size_t fwdLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardData with the specified algo. - size_t bwdDataLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardFilter with the specified algo. - size_t bwdFilterLimitBytes_; - /// Size of the total work space. - size_t workSpaceInBytes_; - bool bias_; - - std::unique_ptr<Weight> weight_; - static ThreadLocalD<std::vector<MemoryHandle*>> convMem_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp deleted file mode 100644 index 45498b92d32e0fa72adbe95a98e8d30c7f8929e2..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvOperator.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ConvOperator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The heights of the two inputs are the same. Each sample of the first input - * is convolved with the corresponding sample of the second input independently. - * - * The config file api is conv_operator.
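- * - * A minimal sketch of the output-size relation assumed here (caffeMode_ is - * fixed to true in ConvBaseOperator): per spatial dimension, - * output = (image + 2 * padding - filter) / stride + 1, - * e.g. a 32x32 image with a 5x5 kernel, padding 2 and stride 1 keeps a - * 32x32 output, which reshape() then checks against outputX_ / outputY_.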
- */ - -REGISTER_OPERATOR(conv, ConvOperator); - -void ConvOperator::reshape(int batchSize) { - imageH_ = ins_[0]->getFrameHeight(); - imageW_ = ins_[0]->getFrameWidth(); - if (imageH_ == 0) imageH_ = imgSizeY_; - if (imageW_ == 0) imageW_ = imgSize_; - outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_); - outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_); - /// Check that the outputSizes are consistent with config - CHECK_EQ(outputH_, outputY_); - CHECK_EQ(outputW_, outputX_); - out_->setFrameHeight(outputH_); - out_->setFrameWidth(outputW_); - - reshapeImageDescriptors(); - - inputOffset_ = channels_ * imageH_ * imageW_; - outputOffset_ = numFilters_ * outputH_ * outputW_; - weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; - - if (!isSelectAlgo_) { - allocConvWorkSpace(); - } - - isSelectAlgo_ = true; -} - -void ConvOperator::forward() { - size_t batchSize = ins_[0]->value->getHeight(); - reshape(batchSize); - CHECK_EQ(ins_[1]->value->getHeight(), batchSize); - checkFilterSize(ins_[1]->value); - Matrix::resizeOrCreate(out_->value, - batchSize, - outputH_ * outputW_ * numFilters_, - false, - useGpu_); - { - AsyncGpuBlock block; - for (size_t batchId = 0; batchId < batchSize; ++batchId) { - real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; - real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - real *outData = out_->value->getData() + outputOffset_ * batchId; - hl_convolution_forward(imageDesc_, - inputData, - outputDesc_, - outData, - filterDesc_, - wgtData, - convDesc_, - workSpace_, - workSpaceInBytes_, - fwdAlgo_); - } - } -} - -void ConvOperator::backward() { - size_t batchSize = ins_[0]->value->getHeight(); - { - AsyncGpuBlock block; - for (size_t batchId = 0; batchId < batchSize; ++batchId) { - real *outGrad = out_->grad->getData() + outputOffset_ * batchId; - if (ins_[1]->grad) { - real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; - real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; - hl_convolution_backward_filter(imageDesc_, - inputData, - outputDesc_, - outGrad, - filterDesc_, - weightGrad, - convDesc_, - workSpace_, - workSpaceInBytes_, - bwdFilterAlgo_); - } - - MatrixPtr preGrad = ins_[0]->grad; - if (NULL != preGrad) { - real *inputGrad = preGrad->getData() + inputOffset_ * batchId; - real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - hl_convolution_backward_data(imageDesc_, - inputGrad, - outputDesc_, - outGrad, - filterDesc_, - wgtData, - convDesc_, - workSpace_, - workSpaceInBytes_, - bwdDataAlgo_); - } - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvOperator.h b/paddle/gserver/layers/ConvOperator.h deleted file mode 100644 index 527dbf8c270f35e19ca23acd8a3ba8197d03b988..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvOperator.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include "ConvBaseOperator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The heights of the two inputs are the same. Each sample of the first input - * is convolved with the corresponding sample of the second input independently. - * - * The config file api is conv_operator. - */ - -class ConvOperator : public ConvBaseOperator { - public: - ConvOperator(const OperatorConfig &config, bool useGpu) - : ConvBaseOperator(config, useGpu) {} - /** - * The workspace and cudnn descriptors are freed by the base class - * destructor. - */ - virtual ~ConvOperator() {} - void forward() override; - void backward() override; - void reshape(int batchSize) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h deleted file mode 100644 index 22a2202bb6cc256a4a5897724d8eb8a93fefb79f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvProjection.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ConvBaseProjection.h" -#include "paddle/math/MathUtils.h" - -namespace paddle { - -/** - * @brief Convolution projection that does the same calculation as CudnnConvLayer. - */ -class ConvProjection : public ConvBaseProjection { - public: - /** - * Constructor. - */ - ConvProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu) - : ConvBaseProjection(config, parameter, useGpu) {} - - ~ConvProjection() {} - - virtual void forward(); - virtual void backward(const UpdateCallback& callback); - virtual size_t calOutputSize(); - virtual size_t calInputSize(); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp deleted file mode 100644 index 615c3478061b591ea30cbf0b3d27ef2551c0dd28..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvShiftLayer.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for circular convluation of two vectors, - * which is used in NEURAL TURING MACHINE. - * - Input: two vectors, the first is data (batchSize x dataDim) - * the second is shift weights (batchSize x shiftDim) - * - Output: a vector (batchSize x dataDim) - * Assumed that: - * - a[in]: contains M elements. - * - b[in]: contains N elements (N should be odd). - * - c[out]: contains M elements. - * - * \f[ - * c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j} - * \f] - * - * In this formula: - * - a's index is computed modulo M. - * - b's index is comupted modulo N. - * - * The config file api is conv_shift_layer. - */ - -class ConvShiftLayer : public Layer { - public: - explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {} - - ~ConvShiftLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(conv_shift, ConvShiftLayer); - -bool ConvShiftLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2U); - - return true; -} - -void ConvShiftLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV0->getHeight(); - size_t dataDim = inV0->getWidth(); - - CHECK_EQ(batchSize, inV1->getHeight()); - CHECK_EQ(dataDim, getSize()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - resetOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - - REGISTER_TIMER_INFO("FwConvShiftTimer", getName().c_str()); - outV->circularConv(*inV0, *inV1); -} - -void ConvShiftLayer::backward(const UpdateCallback& callback) { - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr outG = getOutputGrad(); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - - REGISTER_TIMER_INFO("BwConvShiftTimer", getName().c_str()); - - if (inG0 && inG1) { - outG->circularConvDerivative(*outG, *inV0, *inV1, *inG0, *inG1); - } else { - CHECK(!inG0 || !inG1) << "Not supported"; - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransOperator.cpp b/paddle/gserver/layers/ConvTransOperator.cpp deleted file mode 100644 index ac41d6f9a4f86364930e27ee401406432e731b65..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvTransOperator.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ConvTransOperator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvTransOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The height of data for two inputs are the same. Each data of the first input - * is convolved with each data of the second input indepedently. - * - * The config file api is conv_operator. - */ - -REGISTER_OPERATOR(convt, ConvTransOperator); - -void ConvTransOperator::reshape(int batchSize) { - outputH_ = ins_[0]->getFrameHeight(); - outputW_ = ins_[0]->getFrameWidth(); - if (outputH_ == 0) outputH_ = outputY_; - if (outputW_ == 0) outputW_ = outputX_; - imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_); - imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_); - /// Check that the imageSizes are consistent with config - CHECK_EQ(imageH_, imgSizeY_); - CHECK_EQ(imageW_, imgSize_); - out_->setFrameHeight(imageH_); - out_->setFrameWidth(imageW_); - - reshapeImageDescriptors(); - - inputOffset_ = numFilters_ * outputH_ * outputW_; - outputOffset_ = channels_ * imageH_ * imageW_; - weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; - - if (!isSelectAlgo_) { - allocConvWorkSpace(); - } - - isSelectAlgo_ = true; -} - -void ConvTransOperator::forward() { - size_t batchSize = ins_[0]->value->getHeight(); - reshape(batchSize); - CHECK_EQ(ins_[1]->value->getHeight(), batchSize); - checkFilterSize(ins_[1]->value); - Matrix::resizeOrCreate( - out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_); - { - AsyncGpuBlock block; - for (size_t batchId = 0; batchId < batchSize; ++batchId) { - real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; - real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - real *outData = out_->value->getData() + outputOffset_ * batchId; - hl_convolution_backward_data(imageDesc_, - outData, - outputDesc_, - inputData, - filterDesc_, - wgtData, - convDesc_, - workSpace_, - workSpaceInBytes_, - bwdDataAlgo_); - } - } -} - -void ConvTransOperator::backward() { - size_t batchSize = ins_[0]->value->getHeight(); - { - AsyncGpuBlock block; - for (size_t batchId = 0; batchId < batchSize; ++batchId) { - real *outGrad = out_->grad->getData() + outputOffset_ * batchId; - if (ins_[1]->grad) { - real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; - real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; - hl_convolution_backward_filter(imageDesc_, - outGrad, - outputDesc_, - inputData, - filterDesc_, - weightGrad, - convDesc_, - workSpace_, - workSpaceInBytes_, - bwdFilterAlgo_); - } - - MatrixPtr preGrad = ins_[0]->grad; - if (NULL != preGrad) { - real *inputGrad = preGrad->getData() + inputOffset_ * batchId; - real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - hl_convolution_forward(imageDesc_, - outGrad, - outputDesc_, - inputGrad, - filterDesc_, - wgtData, - convDesc_, - workSpace_, - workSpaceInBytes_, - fwdAlgo_); - } - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransOperator.h b/paddle/gserver/layers/ConvTransOperator.h deleted file mode 100644 index 53cb7a21b49189898d09aa20cd46d04cc5c20198..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvTransOperator.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include "ConvBaseOperator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvTransOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The heights of the two inputs are the same. Each sample of the first input - * is convolved with the corresponding sample of the second input independently. - * - * The config file api is conv_operator. - */ - -class ConvTransOperator : public ConvBaseOperator { - public: - ConvTransOperator(const OperatorConfig &config, bool useGpu) - : ConvBaseOperator(config, useGpu) {} - /** - * The workspace and cudnn descriptors are freed by the base class - * destructor. - */ - virtual ~ConvTransOperator() {} - void forward() override; - void backward() override; - void reshape(int batchSize) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransProjection.h b/paddle/gserver/layers/ConvTransProjection.h deleted file mode 100644 index 0f9ed720d3b8855a3a24ac25a1c3917c4b98e81d..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvTransProjection.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ConvBaseProjection.h" -#include "paddle/math/MathUtils.h" - -namespace paddle { - -/** - * @brief Convolution projection that does the same calculation as CudnnConvLayer. - */ -class ConvTransProjection : public ConvBaseProjection { - public: - /** - * Constructor. - */ - ConvTransProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu) - : ConvBaseProjection(config, parameter, useGpu) {} - - ~ConvTransProjection() {} - - virtual void forward(); - virtual void backward(const UpdateCallback& callback); - virtual size_t calOutputSize(); - virtual size_t calInputSize(); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp deleted file mode 100644 index 31363d97c4fd318ec2c6d48f9200f6ba1f49ba11..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvexCombinationLayer.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for weighted sum of vectors, - * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND - * TRANSLATE - * - Input: the size of the first input is weightDim, - * and the size of the second input is weightDim * dataDim. - * - Output: the size of the output is dataDim - * \f[ - * out(j) = \sum_{i}(in0(i) * in1(i * dataDim + j)), - * i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1) - * \f] - * Note that the above computation is for one sample. Multiple samples are - * processed in one batch. - * - * The config file api is linear_comb_layer. - */ -class ConvexCombinationLayer : public Layer { - protected: - /// A matrix pointer pointing to second input. - MatrixPtr tmpMtx0; - /// A matrix pointer pointing to first input. - MatrixPtr tmpRow0; - /// A matrix pointer pointing to output. - MatrixPtr tmpRow1; - - public: - explicit ConvexCombinationLayer(const LayerConfig& config) : Layer(config) {} - - ~ConvexCombinationLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(convex_comb, ConvexCombinationLayer); - -bool ConvexCombinationLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(2U, inputLayers_.size()); - size_t dataDim = getSize(); - size_t weightDim = inputLayers_[0]->getSize(); - - CHECK_EQ(weightDim * dataDim, inputLayers_[1]->getSize()) - << "Dimension mismatch"; - - tmpRow0 = Matrix::create(nullptr, - /* height= */ 1, - weightDim, - /* trans= */ false, - useGpu_); - tmpRow1 = Matrix::create(nullptr, - /* height= */ 1, - dataDim, - /* trans= */ false, - useGpu_); - tmpMtx0 = Matrix::create(nullptr, - /* height= */ weightDim, - dataDim, - /* trans= */ false, - useGpu_); - - return true; -} - -void ConvexCombinationLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV0->getHeight(); - size_t weightDim = inV0->getWidth(); - size_t dataDim = getSize(); - - CHECK_EQ(batchSize, inV1->getHeight()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - - REGISTER_TIMER_INFO("FwCvxCombTimer", getName().c_str()); - for (size_t i = 0; i < batchSize; i++) { - tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim); - tmpRow0->setData(inV0->getData() + i * weightDim); - tmpRow1->setData(outV->getData() + i * dataDim); - - tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 0); - } -} - -void ConvexCombinationLayer::backward(const UpdateCallback& callback) { - MatrixPtr outG = getOutputGrad(); - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 =
getInputGrad(1); - - size_t batchSize = inV0->getHeight(); - size_t weightDim = inV0->getWidth(); - size_t dataDim = getSize(); - - REGISTER_TIMER_INFO("BwCvxCombTimer", getName().c_str()); - - if (inG0) { - for (size_t i = 0; i < batchSize; i++) { - tmpRow0->setData(inG0->getData() + i * weightDim); - tmpRow1->setData(outG->getData() + i * dataDim); - tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim); - - tmpRow0->mul(*tmpRow1, *(tmpMtx0->getTranspose()), 1, 1); - } - } - - if (inG1) { - for (size_t i = 0; i < batchSize; i++) { - tmpRow0->setData(inV0->getData() + i * weightDim); - tmpRow1->setData(outG->getData() + i * dataDim); - tmpMtx0->setData(inG1->getData() + i * weightDim * dataDim); - - tmpMtx0->mul(*(tmpRow0->getTranspose()), *tmpRow1, 1, 1); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h deleted file mode 100644 index d9fe1ff270f1f76e3b246dca374ddf45445419f9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CosSimLayer.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { -/** - * @brief A layer for calculating cosine similarity between two vectors - * \f[ - * f(x,y)=scale\frac{x_1y_1+x_2y_2+...+x_ny_n}{\sqrt{x_1^2+x_2^2+... - * +x_n^2}\sqrt{y_1^2+y_2^2+...+y_n^2}} - * \f] - * - * - Input1: A vector (batchSize * dataDim) - * - Input2: A vector (batchSize * dataDim) or (1 * dataDim) - * - Output: A vector (batchSize * 1) - * - * The config file api is cos_sim. - */ -class CosSimLayer : public Layer { - public: - explicit CosSimLayer(const LayerConfig& config) : Layer(config) {} - - ~CosSimLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp deleted file mode 100644 index 230ecc768b4d7314b21ac1d76899c3c3bab12309..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CosSimVecMatLayer.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { -/** - * @brief A layer for computing cosine similarity between a vector - * and each row of a matrix - * out[i] = cos_scale * cos(in1, in2(i,:)); - * @note used in NEURAL TURING MACHINE - * - * Input1: a vector (batchSize * dataDim) - * - * Input2: a matrix in vector form (batchSize * (weightDim*dataDim)) - * - * Output: a vector (batchSize * weightDim) - */ - -class CosSimVecMatLayer : public Layer { - protected: - MatrixPtr tmpMtx0; - MatrixPtr tmpMtx1; - MatrixPtr tmpRow0; - MatrixPtr tmpRow1; - MatrixPtr tmpRow2; - MatrixPtr tmpRow3; - - public: - explicit CosSimVecMatLayer(const LayerConfig& config) : Layer(config) {} - - ~CosSimVecMatLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(cos_vm, CosSimVecMatLayer); - -bool CosSimVecMatLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2U); - - size_t dataDim = inputLayers_[0]->getSize(); - size_t numKeys = getSize(); - size_t memoryDim = inputLayers_[1]->getSize(); - - CHECK_EQ(dataDim * numKeys, memoryDim) << "Dimension mismatch"; - - tmpRow0 = Matrix::create(nullptr, - /* height= */ 1, - dataDim, - /* trans= */ false, - useGpu_); - tmpRow1 = Matrix::create(nullptr, - /* height= */ 1, - dataDim, - /* trans= */ false, - useGpu_); - tmpRow2 = Matrix::create(nullptr, - /* height= */ numKeys, - 1, - /* trans= */ false, - useGpu_); - tmpRow3 = Matrix::create(nullptr, - /* height= */ numKeys, - 1, - /* trans= */ false, - useGpu_); - - tmpMtx0 = Matrix::create(nullptr, - /* height= */ numKeys, - dataDim, - /* trans= */ false, - useGpu_); - tmpMtx1 = Matrix::create(nullptr, - /* height= */ numKeys, - dataDim, - /* trans= */ false, - useGpu_); - - CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1); - - createFunction(forward_, - "CosSimForward", - FuncConfig().set("scale", (real)config_.cos_scale())); - createFunction(backward_, - "CosSimBackward", - FuncConfig().set("scale", (real)config_.cos_scale())); - - return true; -} - -void CosSimVecMatLayer::forward(PassType passType) { - Layer::forward(passType); - CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed"; - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV0->getHeight(); - size_t numKeys = getSize(); - - CHECK_EQ(batchSize, inV1->getHeight()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, numKeys); - } - - MatrixPtr outV = getOutputValue(); - CHECK(outV && inV0 && inV1); - REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str()); - for (size_t i = 0; i < batchSize; i++) { - tmpRow0->setData(inV0->rowBuf(i)); - tmpMtx0->setData(inV1->rowBuf(i)); - tmpRow2->setData(outV->rowBuf(i)); - - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*tmpMtx0); - inputs.addArg(*tmpRow0); - outputs.addArg(*tmpRow2, ASSIGN_TO); - forward_[0]->calc(inputs, outputs); - } -} - -void CosSimVecMatLayer::backward(const UpdateCallback& callback) { - CHECK_EQ(backward_.size(), 1UL) << "Only one forward function needed"; - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = 
getInputGrad(1); - MatrixPtr outV = getOutputValue(); - MatrixPtr outG = getOutputGrad(); - - size_t batchSize = inV0->getHeight(); - CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG); - REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str()); - - for (size_t i = 0; i < batchSize; i++) { - tmpRow0->setData(inV0->rowBuf(i)); - tmpRow1->setData(inG0->rowBuf(i)); - tmpMtx0->setData(inV1->rowBuf(i)); - tmpMtx1->setData(inG1->rowBuf(i)); - tmpRow2->setData(outV->rowBuf(i)); - tmpRow3->setData(outG->rowBuf(i)); - - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*tmpRow3); - inputs.addArg(*tmpRow2); - inputs.addArg(*tmpMtx0); - inputs.addArg(*tmpRow0); - outputs.addArg(*tmpMtx1, ADD_TO); - outputs.addArg(*tmpRow1, ADD_TO); - - backward_[0]->calc(inputs, outputs); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp deleted file mode 100644 index 1327616950a8887efa2cba410fa7ae8b5bd97da4..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CostLayer.cpp +++ /dev/null @@ -1,748 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CostLayer.h" -#include <algorithm> -#include <cmath> -#include <memory> -#include "paddle/utils/Logging.h" - -#include "paddle/math/SparseMatrix.h" - -namespace paddle { - -bool CostLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - bool ret = Layer::init(layerMap, parameterMap); - coeff_ = config_.coeff(); - if (!ret) return ret; - CHECK_GE(inputLayers_.size(), 2UL); - CHECK_LE(inputLayers_.size(), 3UL); - if (inputLayers_.size() == 3) { - weightLayer_ = inputLayers_[2]; - } - return true; -} - -void CostLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInputValue(*getOutputLayer())->getHeight(); - int size = 1; - resetOutput(batchSize, size); - - const MatrixPtr& output = getInputValue(*getOutputLayer()); - Argument label = getInput(*getLabelLayer()); - - /* get the cost value for each sample */ - forwardImp(*output, label, *getOutputValue()); - if (weightLayer_) { - const MatrixPtr& weight = getInputValue(*weightLayer_); - getOutputValue()->dotMul(*getOutputValue(), *weight); - } -} - -void CostLayer::backward(const UpdateCallback& callback) { - (void)callback; - - const Argument& output = getInput(*getOutputLayer()); - Argument label = getInput(*getLabelLayer()); - - bool support = true; - if (weightLayer_) { - support = output.grad->getAbsSum() == 0; - } - - backwardImp(*output.value, label, *output.grad); - - if (weightLayer_) { - CHECK(support) << "Weighted cost layer '" << getName() - << "' must be the last layer " - "connected to the output layer '" - << getOutputLayer()->getName() << "'"; - output.grad->rowScale(0, *output.grad, *getInputValue(*weightLayer_)); - } - if (coeff_ != real(1.0f)) { - output.grad->add(coeff_, 0); - } -} - -// -// class MultiClassCrossEntropy -// -bool MultiClassCrossEntropy::init(const LayerMap&
layerMap, - const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void MultiClassCrossEntropy::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - target.oneHotCrossEntropy(output, *label.ids); -} - -void MultiClassCrossEntropy::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - outputG.oneHotCrossEntropyBp(output, *label.ids); -} - -// -// class MultiClassCrossEntropyWithSelfNorm -// -REGISTER_LAYER(multi_class_cross_entropy_with_selfnorm, - MultiClassCrossEntropyWithSelfNorm); - -bool MultiClassCrossEntropyWithSelfNorm::init( - const LayerMap& layerMap, const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_); - output.rowSum(*sftMaxSum_); - sftMaxSum_->log2(); - - target.oneHotCrossEntropy(output, *label.ids); - target.add(*sftMaxSum_); - - sftMaxSum_->square2(); - target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha()); -} - -void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_); - output.rowSum(*sftMaxSum_); - - Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_); - sftMaxSum_->reciprocal2(*sumInv_); - - outputG.oneHotCrossEntropyBp(output, *label.ids); - outputG.addColumnVector(*sumInv_); - - sftMaxSum_->log2(); - sumInv_->dotMul(*sumInv_, *sftMaxSum_); - sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha()); - - outputG.addColumnVector(*sumInv_); -} - -// -// class SoftBinaryClassCrossEntropy -// -REGISTER_LAYER(soft_binary_class_cross_entropy, SoftBinaryClassCrossEntropy); - -bool SoftBinaryClassCrossEntropy::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - Matrix::resizeOrCreate( - targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_); - - targetPerDim_->softCrossEntropy(output, *label.value); - targetPerDim_->rowSum(target); -} - -void SoftBinaryClassCrossEntropy::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - outputG.softCrossEntropyBp(output, *label.value); -} - -// -// class SumOfSquaresCostLayer -// - -REGISTER_LAYER(square_error, SumOfSquaresCostLayer); - -bool SumOfSquaresCostLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void SumOfSquaresCostLayer::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - target.sumOfSquares(output, *label.value); -} - -void SumOfSquaresCostLayer::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - outputG.sumOfSquaresBp(output, *label.value); -} - -// -// class SmoothL1CostLayer -// - -REGISTER_LAYER(smooth_l1, SmoothL1CostLayer); - -bool SmoothL1CostLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void SmoothL1CostLayer::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - MatrixPtr targetCpu, outputCpu, labelCpu; - if (useGpu_) { - targetCpu = - Matrix::create(target.getHeight(), target.getWidth(), false, false); - outputCpu = - Matrix::create(output.getHeight(), 
output.getWidth(), false, false); - labelCpu = Matrix::create( - label.value->getHeight(), label.value->getWidth(), false, false); - targetCpu->copyFrom(target); - outputCpu->copyFrom(output); - labelCpu->copyFrom(*label.value); - targetCpu->smoothL1(*outputCpu, *labelCpu, 1.0); - target.copyFrom(*targetCpu); - } else { - target.smoothL1(output, *label.value, 1.0); - } -} - -void SmoothL1CostLayer::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - MatrixPtr outputGCpu, outputCpu, labelCpu; - if (useGpu_) { - outputGCpu = - Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false); - outputCpu = - Matrix::create(output.getHeight(), output.getWidth(), false, false); - labelCpu = Matrix::create( - label.value->getHeight(), label.value->getWidth(), false, false); - outputGCpu->copyFrom(outputG); - outputCpu->copyFrom(output); - labelCpu->copyFrom(*label.value); - outputGCpu->smoothL1Bp(*outputCpu, *labelCpu, 1.0); - outputG.copyFrom(*outputGCpu); - } else { - outputG.smoothL1Bp(output, *label.value, 1.0); - } -} - -// -// class RankingCost -// -bool RankingCost::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - posPairCount_ = 0; - negPairCount_ = 0; - - bool ret = Layer::init(layerMap, parameterMap); - if (!ret) return ret; - CHECK_GE(inputLayers_.size(), 3UL); - CHECK_LE(inputLayers_.size(), 4UL); - if (inputLayers_.size() == 4) { - weightLayer_ = inputLayers_[3]; - } - return true; -} - -void RankingCost::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInputValue(*getOutputLayer(0))->getHeight(); - int size = 1; - resizeOutput(batchSize, size); - Matrix::resizeOrCreate(margin_, batchSize, size, /* trans= */ false, useGpu_); - MatrixPtr label = getInputValue(*getLabelLayer()); - if (!label) { - // input label is not in value, try ids - IVectorPtr idLabel = getInput(*getLabelLayer()).ids; - CHECK(idLabel) << "label layer has neither value nor ids"; - CHECK_EQ((size_t)batchSize, idLabel->getSize()); - Matrix::resizeOrCreate( - labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, useGpu_); - labelBuf_->copyFrom(*idLabel); - label = labelBuf_; - } - - MatrixPtr output[] = {getInputValue(*getOutputLayer(0)), - getInputValue(*getOutputLayer(1))}; - MatrixPtr target = this->getOutputValue(); - margin_->sub(*output[0], *output[1]); - - // for validation - size_t height = output[0]->getHeight(); - target->biggerThan(*(output[0]), *(output[1]), *label); - double total = static_cast(height); - if (weightLayer_) { - const MatrixPtr& weight = getInputValue(*weightLayer_); - target->dotMul(*target, *weight); - total = weight->getSum(); - } - double pos = target->getSum(); - posPairCount_ += pos; - negPairCount_ += (total - pos); - - // forward - target->logisticRegressionLoss(*margin_, *label); - if (weightLayer_) { - const MatrixPtr& weight = getInputValue(*weightLayer_); - target->dotMul(*target, *weight); - } -} - -void RankingCost::backward(const UpdateCallback& callback) { - (void)callback; - - MatrixPtr label = getInputValue(*getLabelLayer()); - if (!label) { - // input label is not in value, but in ids - // use labelBuf_ (should already resized and copied during forward) - label = labelBuf_; - } - - Matrix::resizeOrCreate( - marginGrad_, label->getHeight(), 1, /* trans= */ false, useGpu_); - marginGrad_->zeroMem(); - marginGrad_->logisticRegressionLossBp(*margin_, *label); - if (weightLayer_) { - const MatrixPtr& weight = getInputValue(*weightLayer_); - 
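- // The per-sample weight rescales the pairwise gradient before it is applied - // symmetrically below (added to input 0, subtracted from input 1). Under the - // usual pairwise logistic formulation this gradient is roughly - // dL/dm = -y / (1 + exp(y * m)), with y in {-1, +1} and m = o1 - o2, - // though the exact sign convention lives in logisticRegressionLossBp.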
marginGrad_->dotMul(*marginGrad_, *weight); - } - - getInputGrad(0)->add(*marginGrad_); - getInputGrad(1)->sub(*marginGrad_); -} - -void RankingCost::onPassEnd() { - double ratio = posPairCount_ / ((negPairCount_ <= 0) ? 1.0 : negPairCount_); - LOG(INFO) << "calc pos/neg: " << ratio << " pos= " << posPairCount_ - << " neg= " << negPairCount_; - - posPairCount_ = 0; - negPairCount_ = 0; -} - -// -// class LambdaCost -// -REGISTER_LAYER(lambda_cost, LambdaCost); - -bool LambdaCost::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - truncationSize_ = config_.ndcg_num(); - maxSortSize_ = config_.max_sort_size(); - if (maxSortSize_ != -1) { - CHECK_GE(maxSortSize_, truncationSize_) - << "maxSortSize must be greater than or equal to NDCG size!"; - } - LOG(INFO) << "LambdaRank v1.3, NDCG size = " << truncationSize_ - << ", Max partial sort size = " << maxSortSize_; - CHECK(!useGpu_) << "LambdaRank supports CPU only!"; - return Layer::init(layerMap, parameterMap); -} - -void LambdaCost::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInputValue(*getOutputLayer())->getHeight(); - resizeOutput(batchSize, 1); - - MatrixPtr score = getInputValue(*getScoreLayer()); - MatrixPtr output = getInputValue(*getOutputLayer()); - MatrixPtr target = this->getOutputValue(); - - real* scoreData = score->getData(); - real* outputData = output->getData(); - real* targetData = target->getData(); - - auto startPos = getInput(*getOutputLayer()).sequenceStartPositions; - const int* startPosData = startPos->getData(false); - size_t batchNum = startPos->getSize() - 1; - for (size_t i = 0; i < batchNum; ++i) { - int beginPos = startPosData[i]; - int endPos = startPosData[i + 1]; - real NDCG = calcNDCG( - outputData + beginPos, scoreData + beginPos, endPos - beginPos); - for (int j = beginPos; j < endPos; ++j) { - targetData[j] = NDCG; - } - } -} - -void LambdaCost::backward(const UpdateCallback& callback) { - (void)callback; - MatrixPtr score = getInputValue(*getScoreLayer()); - MatrixPtr output = getInputValue(*getOutputLayer()); - Matrix::resizeOrCreate(marginGrad_, - score->getHeight(), - 1, - /* trans= */ false, - useGpu_); - marginGrad_->zeroMem(); - - real* gradData = marginGrad_->getData(); - real* scoreData = score->getData(); - real* outputData = output->getData(); - - auto startPos = getInput(*getOutputLayer()).sequenceStartPositions; - const int* startPosData = startPos->getData(false); - size_t batchNum = startPos->getSize() - 1; - - for (size_t i = 0; i < batchNum; ++i) { - int beginPos = startPosData[i]; - int endPos = startPosData[i + 1]; - calcGrad(outputData + beginPos, - scoreData + beginPos, - gradData + beginPos, - endPos - beginPos); - } - - getInputGrad(0)->add(*marginGrad_); -} - -void LambdaCost::calcGrad(const real* outputScore, - const real* score, - real* gradData, - int size) { - CHECK_GE(size, truncationSize_) - << "Invalid: (Sample num in the same list) < (NDCG truncation num) !"; - int sortSize = maxSortSize_ == -1 ? 
size : std::min(maxSortSize_, size); - - scorePair_.clear(); - for (int i = 0; i < size; ++i) { - scorePair_.push_back(std::make_pair(score[i], i)); - } - if (size <= sortSize) { - std::sort(scorePair_.begin(), - scorePair_.end(), - [](const std::pair<real, int>& a, const std::pair<real, int>& b) { - return a.first > b.first; - }); - } else { - std::partial_sort( - scorePair_.begin(), - scorePair_.begin() + sortSize, - scorePair_.end(), - [](const std::pair<real, int>& a, const std::pair<real, int>& b) { - return a.first > b.first; - }); - } - - real maxDCG = 0; - for (int i = 0; i < truncationSize_; ++i) { - maxDCG += (std::pow(2, scorePair_[i].first) - 1) / std::log(i + 2); - } - CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!"; - - for (int i = 0; i < sortSize; ++i) { - for (int j = i + 1; j < size; ++j) { - int index_i = scorePair_[i].second; - int index_j = scorePair_[j].second; - real score_i = score[index_i]; - real score_j = score[index_j]; - real dcgDif = 0; - if (j < sortSize) { - dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) * - (1 / std::log(i + 2) - 1 / std::log(j + 2)); - } else { - dcgDif = - (std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2); - } - - real lambda_ij = - -std::abs(dcgDif) / - (1 + std::exp(outputScore[index_i] - outputScore[index_j])); - gradData[index_i] += lambda_ij / maxDCG; - gradData[index_j] -= lambda_ij / maxDCG; - } - } -} - -real LambdaCost::calcNDCG(const real* outputScore, - const real* score, - int size) { - CHECK_GE(size, truncationSize_) - << "Invalid: (Sample num in the same list) < (NDCG truncation num) !"; - - outputScorePair_.clear(); - for (int i = 0; i < size; ++i) { - outputScorePair_.push_back(std::make_pair(outputScore[i], i)); - } - std::partial_sort( - outputScorePair_.begin(), - outputScorePair_.begin() + truncationSize_, - outputScorePair_.end(), - [](const std::pair<real, int>& a, const std::pair<real, int>& b) { - return a.first > b.first; - }); - - real DCG = 0; - for (int i = 0; i < truncationSize_; ++i) { - DCG += - (std::pow(2, score[outputScorePair_[i].second]) - 1) / std::log(i + 2); - } - - scoreVec_.resize(size); - std::copy(score, score + size, scoreVec_.begin()); - real maxDCG = 0; - std::partial_sort(scoreVec_.begin(), - scoreVec_.begin() + truncationSize_, - scoreVec_.end(), - std::greater<real>()); - for (int i = 0; i < truncationSize_; ++i) { - maxDCG += (std::pow(2, scoreVec_[i]) - 1) / std::log(i + 2); - } - CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!"; - - return DCG / maxDCG; -} - -// -// class MultiBinaryLabelCrossEntropy -// - -REGISTER_LAYER(multi_binary_label_cross_entropy, MultiBinaryLabelCrossEntropy); - -bool MultiBinaryLabelCrossEntropy::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - MatrixPtr value = nullptr; - if (label.ids) { - CHECK(!label.value); - value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_); - } else { - CHECK(label.value); - value = label.value; - } - - if (dynamic_cast<CpuSparseMatrix*>(value.get()) || - dynamic_cast<GpuSparseMatrix*>(value.get())) { - target.multiBinaryLabelCrossEntropy(output, *value); - } else { - Matrix::resizeOrCreate( - targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_); - - targetPerDim_->binaryLabelCrossEntropy(output, *value); - targetPerDim_->rowSum(target); - } -} - -void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - MatrixPtr value = nullptr; - if (label.ids) { - CHECK(!label.value); -
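- // Integer ids are expanded into a one-hot sparse matrix (one row per - // sample, with a single 1 in the labeled column) so that the sparse- and - // dense-label paths below can share the same cross-entropy kernels, - // exactly as forwardImp() does above.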
value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_); - } else { - CHECK(label.value); - value = label.value; - } - - if (dynamic_cast<CpuSparseMatrix*>(value.get()) || - dynamic_cast<GpuSparseMatrix*>(value.get())) { - outputG.multiBinaryLabelCrossEntropyBp(output, *value); - } else { - outputG.binaryLabelCrossEntropyBp(output, *value); - } -} - -bool HuberCost::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - CostLayer::init(layerMap, parameterMap); - if (useGpu_) { - tmpCpuInput_.reserve(inputLayers_.size()); - for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_.push_back(Argument()); - } - } - return true; -} - -void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) { - if (useGpu_) { - for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom( - getInput(i), false, HPPL_STREAM_DEFAULT); - } - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - } -} - -// -// Huber loss for robust regression. -// -REGISTER_LAYER(huber_regression, HuberRegressionLoss); - -bool HuberRegressionLoss::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - HuberCost::init(layerMap, parameterMap); - delta_ = config_.delta(); - return true; -} - -void HuberRegressionLoss::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - HuberCost::forwardImp(output, label, target); - size_t numSamples = target.getHeight(); - size_t dim = output.getWidth(); - CHECK(label.value); - CHECK_EQ((*label.value).getHeight(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(dim, (*label.value).getWidth()); - CHECK_EQ(target.getWidth(), (size_t)1); - - real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); - real* lbl = - useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData(); - std::vector<real> cost(numSamples, 0); - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = 0; j < dim; ++j) { - int index = i * dim + j; - real a = std::abs(lbl[index] - out[index]); - if (a <= delta_) - cost[i] += a * a / 2; - else - cost[i] += delta_ * (a - delta_ / 2); - } - } - target.copyFrom(cost.data(), numSamples); -} - -void HuberRegressionLoss::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - size_t numSamples = output.getHeight(); - size_t dim = output.getWidth(); - real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); - real* lbl = - useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData(); - real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData(); - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = 0; j < dim; ++j) { - int index = i * dim + j; - real a = lbl[index] - out[index]; - if (std::abs(a) <= delta_) - grad[index] += -a; - else - grad[index] += a > 0 ? -delta_ : delta_; - } - } - if (useGpu_) outputG.copyFrom(grad, numSamples * dim); -} - -// -// Huber loss for robust two-class classification -// -REGISTER_LAYER(huber_classification, HuberTwoClassification); - -bool HuberTwoClassification::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return HuberCost::init(layerMap, parameterMap); -} - -void HuberTwoClassification::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - HuberCost::forwardImp(output, label, target); - size_t numSamples = target.getHeight(); - CHECK(label.ids); - CHECK_EQ((*label.ids).getSize(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(output.getWidth(), (size_t)1); - CHECK_EQ(target.getWidth(), (size_t)1); - - real* out = useGpu_ ?
tmpCpuInput_[0].value->getData() : output.getData(); - int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData(); - std::vector<real> cost(numSamples, 0); - for (size_t i = 0; i < numSamples; ++i) { - int y = 2 * lbl[i] - 1; - real a = out[i] * y; - if (a < -1) - cost[i] = -4 * a; - else if (a < 1) - cost[i] = (1 - a) * (1 - a); - } - target.copyFrom(cost.data(), numSamples); -} - -void HuberTwoClassification::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - size_t numSamples = output.getHeight(); - real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); - int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData(); - real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData(); - for (size_t i = 0; i < numSamples; ++i) { - int y = 2 * lbl[i] - 1; - real a = out[i] * y; - if (a < -1) - grad[i] += -4 * y; - else if (a < 1) - grad[i] += -2 * (1 - a) * y; - } - if (useGpu_) outputG.copyFrom(grad, numSamples); -} -/** - * This cost layer computes the sum of its input as the loss. - * \f[ - * o(i) = \sum_{j=1}^D y_{ij} - * \f] - */ -class SumCostLayer : public Layer { - public: - explicit SumCostLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override { - bool ret = Layer::init(layerMap, parameterMap); - if (!ret) return ret; - CHECK_EQ(inputLayers_.size(), 1UL); - return true; - } - - void forward(PassType passType) override { - Layer::forward(passType); - const MatrixPtr& input = getInputValue(0); - - /* malloc memory for the output_ if necessary */ - int batchSize = input->getHeight(); - int size = 1; - resizeOutput(batchSize, size); - output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0); - } - - void backward(const UpdateCallback& callback = nullptr) override { - getInputGrad(0)->add((real)1); - } -}; - -REGISTER_LAYER(sum_cost, SumCostLayer); - -} // namespace paddle diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp deleted file mode 100644 index 644450291ee8a308accf7a1fe096332cc8c241dc..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CrossChannelNormLayer.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include "Layer.h" -#include "NormLayer.h" -#include "paddle/math/BaseMatrix.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data, - size_t iter, - size_t spatialDim) { - return Matrix::create(data->getData() + iter * channels_ * spatialDim, - channels_, - spatialDim, - false, - useGpu_); -} - -MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data, - size_t iter, - size_t spatialDim) { - return Matrix::create( - data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_); -} - -bool CrossChannelNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - CHECK(parameters_[0]); - const NormConfig& conf = config_.inputs(0).norm_conf(); - channels_ = conf.channels(); - scale_.reset(new Weight(channels_, 1, parameters_[0])); - return true; -} - -void CrossChannelNormLayer::forward(PassType passType) { - Layer::forward(passType); - MatrixPtr inV = getInputValue(0); - - size_t batchSize = inV->getHeight(); - size_t dataDim = inV->getWidth(); - CHECK_EQ(getSize(), dataDim); - - reserveOutput(batchSize, dataDim); - MatrixPtr outV = getOutputValue(); - size_t spatialDim = dataDim / channels_; - - Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_); - Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_); - Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_); - - inV->square2(*dataBuffer_); - for (size_t i = 0; i < batchSize; i++) { - const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); - const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim); - MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim); - MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim); - - // compute norm. - spatialBuffer_->sumCols(*dataTmp, 1, 0); - // add eps to avoid overflow - spatialBuffer_->add(1e-6); - spatialBuffer_->sqrt2(*spatialBuffer_); - normTmp->copyFrom(*spatialBuffer_); - outVTmp->copyFrom(*inVTmp); - outVTmp->divRowVector(*spatialBuffer_); - // scale the layer. 
- outVTmp->mulColVector(*scale_->getW()); - } -} - -void CrossChannelNormLayer::backward(const UpdateCallback& callback) { - MatrixPtr inG = getInputGrad(0); - MatrixPtr inV = getInputValue(0); - MatrixPtr outG = getOutputGrad(); - MatrixPtr outV = getOutputValue(); - - size_t batchSize = inG->getHeight(); - size_t dataDim = inG->getWidth(); - size_t spatialDim = dataDim / channels_; - - MatrixPtr inGBuffer; - Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_); - - dataBuffer_->dotMul(*outG, *outV); - Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_); - Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_); - Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_); - scaleDiff_->zeroMem(); - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim); - const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim); - const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); - const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim); - const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim); - - channelBuffer_->sumRows(*dataTmp, 1, 0); - channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW())); - // store a / scale[i] in scaleDiff_ temporarily - scaleDiff_->add(*channelBuffer_, 1.); - - sampleBuffer_->dotMul(*inVTmp, *outGTmp); - spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.); - // scale the grad - inGBuffer->copyFrom(*inVTmp); - inGBuffer->mulRowVector(*spatialBuffer_); - // divide by square of norm - spatialBuffer_->dotMul(*normTmp, *normTmp); - inGBuffer->divRowVector(*spatialBuffer_); - // subtract - inGBuffer->add(*outGTmp, -1, 1); - // divide by norm - inGBuffer->divRowVector(*normTmp); - // scale the diff - inGBuffer->mulColVector(*scale_->getW()); - - inGTmp->add(*inGBuffer); - } - // update scale - if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_); - scale_->getParameterPtr()->incUpdate(callback); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp deleted file mode 100644 index 9a29e6a55e95334def2b83dc4a794e07a7fd5154..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include "CudnnBatchNormLayer.h" -#include "Layer.h" -#include "paddle/cuda/include/hl_batch_norm.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer); - -bool CudnnBatchNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false; - CHECK(useGpu_) << "CudnnBatchNorm only support GPU"; - - hl_create_tensor_descriptor(&ioDesc_); - hl_create_tensor_descriptor(&bnParamDesc_); - hl_tensor_reshape(bnParamDesc_, 1, channels_, 1, 1); - - return true; -} - -void CudnnBatchNormLayer::reshape(int batchSize) { - hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_); -} - -void CudnnBatchNormLayer::forward(PassType passType) { - Layer::forward(passType); - - int batchSize = getInputValue(0)->getHeight(); - calFeatureMapSize(); - reshape(batchSize); - resetOutput(batchSize, getInputValue(0)->getWidth()); - - // for testing in training peroid. - useGlobalStats_ = (passType == PASS_TEST); - if (passType == PASS_TEST && config_.has_use_global_stats()) { - useGlobalStats_ = config_.use_global_stats(); - } - - real* input = getInputValue(0)->getData(); - real* output = getOutputValue()->getData(); - real* gamma = weight_->getW()->getData(); - real* beta = biases_->getW()->getData(); - real* movingMean = movingMean_->getW()->getData(); - real* movingVar = movingVar_->getW()->getData(); - - // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON. - eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast(epsilon_)); - - if (!useGlobalStats_) { - REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str()); - real* savedMean = savedMean_->getData(); - real* savedInvVar = savedInvVar_->getData(); - hl_batch_norm_forward_training(ioDesc_, - input, - ioDesc_, - output, - bnParamDesc_, - gamma, - beta, - 1.0 - movingAvgFraction_, - movingMean, - movingVar, - eps_, - savedMean, - savedInvVar); - } else { - // used movingMean and movingVar in testing - if (batchSize <= 1024) { - hl_batch_norm_forward_inference(ioDesc_, - input, - ioDesc_, - output, - bnParamDesc_, - gamma, - beta, - movingMean, - movingVar, - eps_); - } else { - // There is a limitation in cudnn library. - // When the batch size is larger than 1024 in cuDNN v5.1, - // the cudnnBatchNormalizationForwardInference will fail. - hl_batch_norm_cuda_inference(input, - output, - gamma, - beta, - movingMean, - movingVar, - eps_, - batchSize, - channels_, - imageH_ * imageD_, - imageW_); - } - } - - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - - real* input = getInputValue(0)->getData(); - real* outGrad = getOutputGrad()->getData(); - real* inGrad = getInputGrad(0)->getData(); - real* gamma = weight_->getW()->getData(); - real* savedMean = savedMean_->getData(); - real* savedInvVar = savedInvVar_->getData(); - - // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON. 
- eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_)); - - auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) { - Matrix::resizeOrCreate(m, h, w, false, true); - m->zeroMem(); - *p = m->getData(); - }; - - real* gammaGrad = nullptr; - real* betaGrad = nullptr; - if (weight_->getWGrad()) { - gammaGrad = weight_->getWGrad()->getData(); - } else { - create(tmpWGrad_, 1, channels_, &gammaGrad); - } - if (biases_ && biases_->getWGrad()) { - betaGrad = biases_->getWGrad()->getData(); - } else { - create(tmpBiasGrad_, 1, channels_, &betaGrad); - } - - hl_batch_norm_backward(ioDesc_, - input, - ioDesc_, - outGrad, - ioDesc_, - inGrad, - bnParamDesc_, - gamma, - gammaGrad, - betaGrad, - eps_, - savedMean, - savedInvVar); - - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - biases_->getParameterPtr()->incUpdate(callback); - weight_->getParameterPtr()->incUpdate(callback); - } -} - -CudnnBatchNormLayer::~CudnnBatchNormLayer() { - hl_destroy_tensor_descriptor(ioDesc_); - hl_destroy_tensor_descriptor(bnParamDesc_); - } - -} // namespace paddle diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.h b/paddle/gserver/layers/CudnnConvBaseLayer.h deleted file mode 100644 index 1ee1aa100d8adaed04ce24ee12b5b9af52c14b13..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CudnnConvBaseLayer.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include <vector> -#include "ConvBaseLayer.h" -#include "Projection.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief A 2-dimension conv layer implemented by cuDNN. It only - * supports GPU mode. We automatically select CudnnConvLayer for GPU - * mode and ExpandConvLayer for CPU mode if you set type of "conv". - * Users can also specify type of "exconv" or "cudnn_conv" for a - * particular implementation. - * - * The config file api is img_conv_layer. - */ -class CudnnConvBaseLayer : public ConvBaseLayer { - protected: - std::vector<std::unique_ptr<ProjectionConfig>> projConf_; - std::vector<std::unique_ptr<Projection>> projections_; - - hl_tensor_descriptor biasDesc_; - hl_tensor_descriptor outputDesc_; - - public: - explicit CudnnConvBaseLayer(const LayerConfig& config) - : ConvBaseLayer(config) {} - - ~CudnnConvBaseLayer(); - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/CudnnPoolLayer.cpp b/paddle/gserver/layers/CudnnPoolLayer.cpp deleted file mode 100644 index ac6d2168f43590a6acd70f6641ff729327894ea0..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CudnnPoolLayer.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CudnnPoolLayer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -bool CudnnPoolLayer::typeCheck(const std::string &poolType, - hl_pooling_mode_t *mode) { - if (poolType == "cudnn-max-pool") { - if (mode) { - *mode = HL_POOLING_MAX; - } - } else if (poolType == "cudnn-avg-pool") { - if (mode) { - *mode = HL_POOLING_AVERAGE; - } - } else if (poolType == "cudnn-avg-incl-pad-pool") { - if (mode) { - *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING; - } - } else { - return false; - } - - return true; -} - -CudnnPoolLayer::CudnnPoolLayer(const LayerConfig &config) : PoolLayer(config) { - const std::string &pool_type = config.inputs(0).pool_conf().pool_type(); - CHECK_EQ(CudnnPoolLayer::typeCheck(pool_type, &mode_), true); -} - -bool CudnnPoolLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { - PoolLayer::init(layerMap, parameterMap); - - CHECK(useGpu_) << "CudnnPoolLayer only support gpu"; - - hl_create_tensor_descriptor(&inputDesc_); - hl_create_tensor_descriptor(&outputDesc_); - - windowHeight = sizeY_; - windowWidth = sizeX_; - heightPadding = confPaddingY_; - widthPadding = confPadding_; - strideHeight = strideY_; - strideWidth = stride_; - - hl_create_pooling_descriptor(&poolingDesc_, - mode_, - windowHeight, - windowWidth, - heightPadding, - widthPadding, - strideHeight, - strideWidth); - - return true; -} - -void CudnnPoolLayer::reshape(int batchSize) { - imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); - imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); - if (imageH_ == 0) { - imageH_ = imgSizeY_; - } - if (imageW_ == 0) { - imageW_ = imgSize_; - } - CHECK_EQ(inputLayers_[0]->getOutput().value->getWidth(), - channels_ * imageH_ * imageW_); - outputH_ = outputSize(imageH_, - sizeY_, - confPaddingY_, - strideY_, - /* caffeMode */ false); - outputW_ = - outputSize(imageW_, sizeX_, confPadding_, stride_, /* caffeMode */ false); - getOutput().setFrameHeight(outputH_); - getOutput().setFrameWidth(outputW_); - - hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_); - hl_tensor_reshape(outputDesc_, batchSize, channels_, outputH_, outputW_); -} - -void CudnnPoolLayer::forward(PassType passType) { - Layer::forward(passType); - - CHECK(inputLayers_[0]->getOutputValue()->useGpu()); - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - reshape(batchSize); - resetOutput(batchSize, outputH_ * outputW_ * channels_); - - real *inputData = getInputValue(0)->getData(); - real *outData = getOutputValue()->getData(); - hl_pooling_forward(inputDesc_, inputData, outputDesc_, outData, poolingDesc_); -} - -void CudnnPoolLayer::backward(const UpdateCallback &callback) { - (void)callback; - if (NULL == getInputGrad(0)) { - return; - } - - real *inputData = getInputValue(0)->getData(); - real *inputGrad = getInputGrad(0)->getData(); - real *outData = getOutputValue()->getData(); - real *outGrad = getOutputGrad()->getData(); - hl_pooling_backward(inputDesc_, - inputData, - inputGrad, - outputDesc_, - outData, - outGrad, - poolingDesc_); -} - 
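- // For reference, a minimal sketch (an assumption about the caffeMode=false
- // branch of outputSize() used in reshape() above, not code from this file)
- // of the ceil-mode output-size rule:
- //   int poolOutputSize(int imageSize, int kernel, int padding, int stride) {
- //     // round the number of window positions up, then add the first window
- //     return (imageSize - kernel + 2 * padding + stride - 1) / stride + 1;
- //   }
- // e.g. imageSize = 7, kernel = 3, padding = 1, stride = 2
- //   -> (7 - 3 + 2 + 1) / 2 + 1 = 4 output positions per spatial axis.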
-CudnnPoolLayer::~CudnnPoolLayer() { - hl_destroy_tensor_descriptor(inputDesc_); - hl_destroy_tensor_descriptor(outputDesc_); - hl_destroy_pooling_descriptor(poolingDesc_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h deleted file mode 100644 index 7ae67a877b488c8d197896b8b1e3e90057fbe1c9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/DataNormLayer.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/** - * @brief A layer for data normalization - * - Input: One and only one input layer is accepted. The input layer must - * be DataLayer with dense data type. - * - Output: The normalization of the input data - * - * Reference: - * LA Shalabi, Z Shaaban, B Kasasbeh. Data mining: A preprocessing engine - * - * Three data normalization methoeds are considered - * - z-score: y = (x-mean)/std - * - min-max: y = (x-min)/(max-min) - * - decimal-scaling: y = x/10^j, where j is the smallest integer such that - *max(|y|)<1 - */ - -class DataNormLayer : public Layer { - public: - enum NormalizationStrategy { kZScore = 0, kMinMax = 1, kDecimalScaling = 2 }; - - explicit DataNormLayer(const LayerConfig& config) : Layer(config) {} - - ~DataNormLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - protected: - int mode_; - std::unique_ptr weight_; - MatrixPtr min_; - MatrixPtr rangeReciprocal_; // 1/(max-min) - MatrixPtr mean_; - MatrixPtr stdReciprocal_; // 1/std - MatrixPtr decimalReciprocal_; // 1/10^j -}; -} // namespace paddle diff --git a/paddle/gserver/layers/DeConv3DLayer.h b/paddle/gserver/layers/DeConv3DLayer.h deleted file mode 100644 index 13d1d07cf5cc6e2a6ea89768e29b1fe8cda5e81c..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/DeConv3DLayer.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "ConvBaseLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief A subclass of deconvolution3D layer. 
- * This layer expands the input and uses matrix multiplication to - * calculate the deconvolution3D operation. - */ -class DeConv3DLayer : public ConvBaseLayer { - public: - explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {} - ~DeConv3DLayer() {} - bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - - void forward(PassType passType); - void addBias(); - void backward(const UpdateCallback& callback); - void bpropBiases(); - void bpropData(int i); - void bpropWeights(int i); - size_t getSize(); - - protected: - // Figure out the dimensions for individual gemms. - IntV M_; /// numFilters_ / filter_group_; - IntV N_; /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_ - IntV K_; /// outputD_ * outputH_ * outputW_ - IntV NOut_; - MatrixPtr colBuf_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/DetectionUtil.h b/paddle/gserver/layers/DetectionUtil.h deleted file mode 100644 index d6502fcf8fb12a434632876c25ac3ca23b87e60e..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/DetectionUtil.h +++ /dev/null @@ -1,307 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include <map> -#include <utility> -#include <vector> -#include "paddle/math/Matrix.h" - -using std::vector; -using std::pair; -using std::map; - -namespace paddle { - -template <typename T> -struct BBoxBase { - BBoxBase(T xMin, T yMin, T xMax, T yMax) - : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {} - - BBoxBase() {} - - T getWidth() const { return xMax - xMin; } - - T getHeight() const { return yMax - yMin; } - - T getCenterX() const { return (xMin + xMax) / 2; } - - T getCenterY() const { return (yMin + yMax) / 2; } - - T getArea() const { return getWidth() * getHeight(); } - - // coordinate of bounding box - T xMin; - T yMin; - T xMax; - T yMax; - // whether difficult object (e.g. object with heavy occlusion is difficult) - bool isDifficult; -}; - -struct NormalizedBBox : BBoxBase<real> { - NormalizedBBox() : BBoxBase<real>() {} -}; - -enum PermMode { kNCHWToNHWC, kNHWCToNCHW }; - -/** - * @brief First permute the input matrix, then append to the output matrix - */ -size_t appendWithPermute(const Matrix& inMatrix, - size_t height, - size_t width, - size_t outTotalSize, - size_t outOffset, - size_t batchSize, - Matrix& outMatrix, - PermMode permMode); - -/** - * @brief First permute the input matrix, then decompose to the output - */ -size_t decomposeWithPermute(const Matrix& inMatrix, - size_t height, - size_t width, - size_t totalSize, - size_t offset, - size_t batchSize, - Matrix& outMatrix, - PermMode permMode); - -/** - * @brief Compute jaccard overlap between two bboxes.
- * @param bbox1 The first bbox - * @param bbox2 The second bbox - */ -real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2); - -/** - * @brief Compute offset parameters between prior bbox and ground truth bbox, - * taking the variances of the prior bbox into account - * @param priorBBox Input prior bbox - * @param priorBBoxVar Variance parameters of prior bbox - * @param gtBBox Groundtruth bbox - * @param outVec Output vector - */ -void encodeBBoxWithVar(const NormalizedBBox& priorBBox, - const vector<real>& priorBBoxVar, - const NormalizedBBox& gtBBox, - vector<real>& outVec); - -/** - * @brief Decode prior bbox with offset parameters, - * taking the variances of the prior bbox into account - * @param priorBBox Prior bbox to be decoded - * @param priorBBoxVar Variance parameters of prior bbox - * @param locPredData Offset parameters - */ -NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox, - const vector<real>& priorBBoxVar, - const vector<real>& locPredData); - -/** - * @brief Extract bboxes from prior matrix, the layout is - * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ... - * @param priorData Matrix of prior value - * @param numBBoxes Number of bbox to be extracted - * @param bboxVec Append to the vector - */ -void getBBoxFromPriorData(const real* priorData, - const size_t numBBoxes, - vector<NormalizedBBox>& bboxVec); - -/** - * @brief Extract labels, scores and bboxes from detection matrix, the layout is - * imageId | label | score | xmin | ymin | xmax | ymax - * @param detectData Matrix of detection value - * @param numBBoxes Number of bbox to be extracted - * @param labelVec Label of bbox - * @param scoreVec Score of bbox - * @param bboxVec Append to the vector - */ -void getBBoxFromDetectData(const real* detectData, - const size_t numBBoxes, - vector<size_t>& labelVec, - vector<real>& scoreVec, - vector<NormalizedBBox>& bboxVec); - -/** - * @brief Extract variances from prior matrix, the layout is - * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ... - * @param priorData Matrix of prior value - * @param num Number to be extracted - * @param varVec Append to the vector - */ -void getBBoxVarFromPriorData(const real* priorData, - const size_t num, - vector<vector<real>>& varVec); - -/** - * @brief Extract bboxes from label matrix, the layout is - * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ... - * @param labelData Matrix of label value - * @param numBBoxes Number to be extracted - * @param bboxVec Append to the vector - */ -void getBBoxFromLabelData(const real* labelData, - const size_t numBBoxes, - vector<NormalizedBBox>& bboxVec); - -/** -* @brief Match prior bbox to groundtruth bbox, the strategy is: -1. Find the most overlapped bbox pair (prior and groundtruth) -2.
For the rest of the prior bboxes, find the most overlapped groundtruth bbox -* @param priorBBoxes prior bbox -* @param gtBBoxes groundtruth bbox -* @param overlapThreshold Low boundary of overlap (judge whether matched) -* @param matchIndices For each prior bbox, groundtruth bbox index if matched otherwise -1 -* @param matchOverlaps For each prior bbox, overlap with all groundtruth bboxes -*/ -void matchBBox(const vector<NormalizedBBox>& priorBBoxes, - const vector<NormalizedBBox>& gtBBoxes, - real overlapThreshold, - vector<int>* matchIndices, - vector<real>* matchOverlaps); - -/** -* @brief Generate positive bboxes and negative bboxes, -|positive bboxes|/|negative bboxes| is negPosRatio -* @param priorValue Prior value -* @param numPriorBBoxes Number of prior bbox -* @param gtValue Groundtruth value -* @param gtStartPosPtr Since groundtruth value stored as sequence type, -this parameter indicates start position of each record -* @param seqNum Number of sequence -* @param maxConfScore Classification score for prior bbox, used to mine -negative examples -* @param batchSize Image number -* @param overlapThreshold Low boundary of overlap -* @param negOverlapThreshold Upper boundary of overlap (judge negative example) -* @param negPosRatio Control number of negative bboxes -* @param matchIndicesVecPtr Save indices of matched prior bbox -* @param negIndicesVecPtr Save indices of negative prior bbox -*/ -pair<size_t, size_t> generateMatchIndices( - const Matrix& priorValue, - const size_t numPriorBBoxes, - const Matrix& gtValue, - const int* gtStartPosPtr, - const size_t seqNum, - const vector<vector<real>>& maxConfScore, - const size_t batchSize, - const real overlapThreshold, - const real negOverlapThreshold, - const size_t negPosRatio, - vector<vector<int>>* matchIndicesVecPtr, - vector<vector<int>>* negIndicesVecPtr); - -/** - * @brief Get max confidence score for each prior bbox - * @param confData Confidence scores, layout is - * class1 score | class2 score | ... | classN score ...
- * @param batchSize Image number - * @param numPriorBBoxes Prior bbox number - * @param numClasses Classes number - * @param backgroundId Background id - * @param maxConfScoreVecPtr Output - */ -void getMaxConfidenceScores(const real* confData, - const size_t batchSize, - const size_t numPriorBBoxes, - const size_t numClasses, - const size_t backgroundId, - vector<vector<real>>* maxConfScoreVecPtr); - -template <typename T> -bool sortScorePairDescend(const pair<real, T>& pair1, - const pair<real, T>& pair2); - -template <> -bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1, - const pair<real, NormalizedBBox>& pair2); - -/** - * @brief Do NMS for bboxes to remove duplicated bboxes - * @param bboxes BBoxes to apply NMS - * @param confScoreData Confidence scores - * @param classIdx Class to do NMS - * @param topK Number to keep - * @param confThreshold Low boundary of confidence score - * @param nmsThreshold Threshold of overlap - * @param numPriorBBoxes Total number of prior bboxes - * @param numClasses Total class number - * @param indices Indices of high quality bboxes - */ -void applyNMSFast(const vector<NormalizedBBox>& bboxes, - const real* confScoreData, - size_t classIdx, - size_t topK, - real confThreshold, - real nmsThreshold, - size_t numPriorBBoxes, - size_t numClasses, - vector<size_t>* indices); - -/** - * @brief Get detection results which satisfy requirements - * @param numPriorBBoxes Prior bbox number - * @param numClasses Class number - * @param backgroundId Background class - * @param batchSize Image number - * @param confThreshold Threshold of class confidence - * @param nmsTopK Used in NMS operation to keep top k bbox - * @param nmsThreshold Used in NMS, threshold of overlap - * @param keepTopK How many bboxes kept in an image - * @param allDecodedBBoxes Decoded bboxes for all images - * @param allDetectionIndices Save detection bbox indices - */ -size_t getDetectionIndices( - const real* confData, - const size_t numPriorBBoxes, - const size_t numClasses, - const size_t backgroundId, - const size_t batchSize, - const real confThreshold, - const size_t nmsTopK, - const real nmsThreshold, - const size_t keepTopK, - const vector<vector<NormalizedBBox>>& allDecodedBBoxes, - vector<map<size_t, vector<size_t>>>* allDetectionIndices); - -/** - * @brief Get detection results - * @param confData Confidence scores - * @param numPriorBBoxes Prior bbox number - * @param numClasses Class number - * @param batchSize Image number - * @param allIndices Indices of predicted bboxes - * @param allDecodedBBoxes BBoxes decoded - * @param out Output matrix - * image number | label | confidence score | xMin | yMin | xMax | yMax - */ -void getDetectionOutput(const real* confData, - const size_t numKept, - const size_t numPriorBBoxes, - const size_t numClasses, - const size_t batchSize, - const vector<map<size_t, vector<size_t>>>& allIndices, - const vector<vector<NormalizedBBox>>& allDecodedBBoxes, - Matrix& out); - -NormalizedBBox clipBBox(const NormalizedBBox& bbox); - -} // namespace paddle diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp deleted file mode 100644 index 72b0c707b2131dc275ba604cd20ae0007c34a9a9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/DotProdLayer.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for computing the dot product of two vectors. - * Input1: vector (batchSize * dim) - * Input2: vector (batchSize * dim) - * Output: a matrix: (batchSize * 1) - */ - -class DotProdLayer : public Layer { - public: - explicit DotProdLayer(const LayerConfig& config) : Layer(config) {} - - ~DotProdLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(dot_prod, DotProdLayer); - -bool DotProdLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2U); - CHECK_EQ(1UL, getSize()) - << "The output dimensionality of this layer should be fixed to 1."; - - return true; -} - -void DotProdLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV0->getHeight(); - CHECK_EQ(inV1->getHeight(), batchSize); - CHECK_EQ(inV0->getWidth(), inV1->getWidth()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, 1); - } - - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str()); - outV->sumOfProducts(*inV0, *inV1, 1, 0); - } -} - -void DotProdLayer::backward(const UpdateCallback& callback) { - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr outG = getOutputGrad(); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - - { - REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str()); - - if (inG0) { - inG0->addRowScale(0, *inV1, *outG); - } - - if (inG1) { - inG1->addRowScale(0, *inV0, *outG); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h deleted file mode 100644 index 6919ef71355a4c660b9ddd60bff75fee399cfaa9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ExpandConvLayer.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "ConvBaseLayer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief A subclass of convolution layer. 
- * This layer expands input and use matrix multiplication to - * calculate convolution operation. - * - * The config file api is img_conv_layer. - */ - -class ExpandConvLayer : public ConvBaseLayer { - public: - explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {} - - ~ExpandConvLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - - size_t getOutputSize(); - - protected: - std::vector inputShape_; - std::vector filterShape_; - std::vector outputShape_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h deleted file mode 100644 index 06bd4ef05ee206628d981fee8e7eec3c91b18b7a..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ExpandLayer.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * A layer for "Expand Dense data or (sequence data where the length of each - * sequence is one) to sequence data." - * - * It should have exactly 2 input, one for data, one for size: - * - first one for data - * - If ExpandLevel = kNonSeq: dense data - * - If ExpandLevel = kSeq: sequence data where the length of each sequence is - * one - * - second one only for sequence info - * - should be sequence data with or without sub-sequence. - * - * And the output size is the batch size(not instances) of second input. - * - * The config file api is expand_layer. - */ - -class ExpandLayer : public Layer { - protected: - std::unique_ptr biases_; - /// if input[0] is dense data, ExpandLevel=kNonSeq; - /// if input[0] is sequence data, ExpandLevel=kSeq - enum ExpandLevel { kNonSeq = 0, kSeq = 1 }; - /// store the ExpandLevel - int type_; - /// expanded sequenceStartPositions or subSequenceStartPositions - /// of input[1] - ICpuGpuVectorPtr expandStartsPos_; - - public: - explicit ExpandLayer(const LayerConfig& config) : Layer(config) {} - - ~ExpandLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp deleted file mode 100644 index 1744faada2ebd9f2c88ba9a3952b6b2646729e3b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "FactorizationMachineLayer.h" -#include <algorithm> -#include <vector> -#include "paddle/math/SparseMatrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(factorization_machine, FactorizationMachineLayer); - -bool FactorizationMachineLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - factorSize_ = config_.factor_size(); - - /* initialize the latentVectors_ */ - CHECK_EQ(inputLayers_.size(), 1UL); - size_t inputSize = inputLayers_[0]->getSize(); - CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_); - latentVectors_ = std::unique_ptr<Weight>( - new Weight(inputSize, factorSize_, parameters_[0])); - - return true; -} - -void FactorizationMachineLayer::forward(PassType passType) { - Layer::forward(passType); - - const MatrixPtr& inputV = getInputValue(0); - - size_t batchSize = inputV->getHeight(); - size_t outputSize = getSize(); - size_t inputSize = inputLayers_[0]->getSize(); - reserveOutput(batchSize, outputSize); - - MatrixPtr outV = getOutputValue(); - - Matrix::resizeOrCreate( - latentVectorsSquare_, inputSize, factorSize_, false, useGpu_); - Matrix::resizeOrCreate( - inputMulFactor_, batchSize, factorSize_, false, useGpu_); - Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); - - REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str()); - inputMulFactor_->mul(*inputV, *latentVectors_->getW()); - inputMulFactor_->square2(*tmpOut_); - outV->sumRows(*tmpOut_, 0.5, 0); - - if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) { - Matrix::resizeOrCreateSparseMatrix(inputSquare_, - inputV->getHeight(), - inputV->getWidth(), - inputV->getElementCnt(), - inputV->getValueType()); - inputSquare_->copyFrom(*inputV); - (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2(); - } else { - Matrix::resizeOrCreate( - inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); - inputV->square2(*inputSquare_); - } - latentVectors_->getW()->square2(*latentVectorsSquare_); - tmpOut_->mul(*inputSquare_, *latentVectorsSquare_); - outV->sumRows(*tmpOut_, -0.5, 1.0); - - /* activation */ { - REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void FactorizationMachineLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { backwardActivation(); } - - const MatrixPtr& inputV = getInputValue(0); - const MatrixPtr& oGrad = getOutputGrad(); - - Matrix::resizeOrCreate( - tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_); - MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0), - latentVectors_->getW()->getHeight(), - 1, - false, - useGpu_); - - /* Calculate the gradients of the latentVectors_ matrix */ - if (latentVectors_->getWGrad()) { - if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) { - Matrix::resizeOrCreateSparseMatrix(tmpInput_, - inputV->getHeight(), - inputV->getWidth(), - inputV->getElementCnt()); - - CpuSparseMatrix* sparseInputV = - dynamic_cast<CpuSparseMatrix*>(inputV.get()); - CpuSparseMatrix* sparseInputSquare = - dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()); - CpuSparseMatrix* sparseTmpInput = -
dynamic_cast<CpuSparseMatrix*>(tmpInput_.get()); - sparseTmpInput->copyFrom(*sparseInputV); - - sparseTmpInput->rowScale(0, *sparseInputV, *oGrad); - latentVectors_->getWGrad()->mul( - *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1); - sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad); - - Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_); - negOnes_->zeroMem(); - negOnes_->add(-1); - tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0); - } else { - Matrix::resizeOrCreate( - tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); - - tmpInput_->rowScale(0, *inputV, *oGrad); - latentVectors_->getWGrad()->mul( - *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1); - tmpInput_->rowScale(0, *inputSquare_, *oGrad); - - tmpSum_->sumCols(*tmpInput_, -1, 0); - } - - latentVectors_->getWGrad()->addRowScale( - 0, *latentVectors_->getW(), *tmpSumTrans); - - /* Increasing the number of gradient */ - latentVectors_->getParameterPtr()->incUpdate(callback); - } - - /* Calculate the input layers gradient */ - MatrixPtr inGrad = getInputGrad(0); - if (inGrad != NULL) { - inGrad->mul( - *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1); - tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0); - inGrad->addColScale(0, *inputV, *tmpSum_); - inGrad->rowScale(0, *inGrad, *oGrad); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h deleted file mode 100644 index 148abe238173dd44cd0fcf3f5cda732f70078706..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { -/** - * @brief The Factorization Machine models pairwise (order-2) feature - * interactions as inner product of the learned latent vectors corresponding - * to each input feature. - * - * The Factorization Machine can effectively capture feature interactions - * especially when the input is sparse. While in principle FM can model higher - * order feature interaction, in practice usually only order-2 feature - * interactions are considered. The Factorization Machine Layer here only - * computes the order-2 interactions with the formula: - * - * \f[ - * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j - * \f] - * - * The detailed calculation for forward and backward can be found in this paper: - * - * Factorization machines. - * - * The config file api is factorization_machine.
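- * - * Note (a clarifying sketch using the standard factorization-machine - * identity, not text from the original comment): the forward pass evaluates - * the sum above in O(nk) time via - * \f[ - * \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j = - * \frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i=1}^n v_{i,f} x_i\right)^2 - - * \sum_{i=1}^n v_{i,f}^2 x_i^2\right] - * \f] - * which is exactly what forward() above computes: inputMulFactor_ holds the - * inner sums, and the two sumRows() calls apply the 0.5 and -0.5 coefficients.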
- */ - -class FactorizationMachineLayer : public Layer { - protected: - // The latent vectors, shape: (size, factorSize_) - // Each row of the latentVectors_ matrix is the latent vector - // corresponding to one input feature dimension - std::unique_ptr<Weight> latentVectors_; - // The hyperparameter that defines the dimensionality of the factorization - size_t factorSize_; - - private: - // Store the square values of the latent vectors matrix - MatrixPtr latentVectorsSquare_; - // Store the square values of input matrix - MatrixPtr inputSquare_; - // The result of input matrix * latent vector matrix that will be used in - // both forward and backward step - MatrixPtr inputMulFactor_; - // Store temporary calculation result - MatrixPtr tmpOut_; - MatrixPtr tmpSum_; - MatrixPtr tmpInput_; - // A row vector of -1s, used to compute negative column sums - MatrixPtr negOnes_; - - public: - explicit FactorizationMachineLayer(const LayerConfig& config) - : Layer(config) {} - ~FactorizationMachineLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp deleted file mode 100644 index d95f0b9b3d13e8bff635373cb4d5705c2351bd97..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for expanding a batch of images to feature maps. - * Each data of the input is a 2-dimensional matrix. Each element of the matrix - * is replicated num_filters times to create a feature map with num_filters - * channels. - * - Input: Input one should be dense image data. - * - Output: expanded feature maps.
- * \f[ - * y.row[i] = x.row[i \mod x.width], i = 0,1,..., (x.width * num\_filters - 1) - * \f] - * For example, num_filters = 4: - * @code - * x = [a1,a2; - * b1,b2] - * y = [a1, a2, a1, a2, a1, a2, a1, a2; - * b1, b2, b1, b2, b1, b2, b1, b2;] - * @endcode - */ - -class FeatureMapExpandLayer : public Layer { - private: - int numFilters_; - bool asRowVector_; - - public: - explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {} - - ~FeatureMapExpandLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer); - -bool FeatureMapExpandLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 1UL); - numFilters_ = config_.num_filters(); - asRowVector_ = config_.user_arg() != "as_col_vec"; - return true; -} - -void FeatureMapExpandLayer::forward(PassType passType) { - Layer::forward(passType); - MatrixPtr inputV = getInputValue(0); - size_t batchSize = getInput(0).getBatchSize(); - int imgSize = inputV->getWidth(); - resetOutput(batchSize, imgSize * numFilters_); - - MatrixPtr outputV = getOutputValue(); - - { - AsyncGpuBlock asyncGpuBlock; - if (asRowVector_) { - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr outVTmp = - Matrix::create(outputV->getData() + i * imgSize * numFilters_, - numFilters_, - imgSize, - false, - useGpu_); - MatrixPtr inVTmp = Matrix::create( - inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_); - outVTmp->addRowVector(*inVTmp); - } - } else { - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr outVTmp = - Matrix::create(outputV->getData() + i * imgSize * numFilters_, - imgSize, - numFilters_, - false, - useGpu_); - MatrixPtr inVTmp = Matrix::create( - inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_); - outVTmp->addColVector(*inVTmp); - } - } - } - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void FeatureMapExpandLayer::backward(const UpdateCallback& callback) { - MatrixPtr inGrad = getInputGrad(0); - if (NULL == inGrad) { - return; - } - MatrixPtr outGrad = getOutputGrad(); - size_t batchSize = getInput(0).getBatchSize(); - int imgSize = inGrad->getWidth(); - /* Do activation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - { - AsyncGpuBlock asyncGpuBlock; - if (asRowVector_) { - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr outGradTmp = - Matrix::create(outGrad->getData() + i * imgSize * numFilters_, - numFilters_, - imgSize, - false, - useGpu_); - MatrixPtr inGradTmp = Matrix::create( - inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_); - inGradTmp->collectBias(*outGradTmp, 1); - } - } else { - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr outGradTmp = - Matrix::create(outGrad->getData() + i * imgSize * numFilters_, - imgSize, - numFilters_, - false, - useGpu_); - MatrixPtr inGradTmp = Matrix::create( - inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_); - inGradTmp->sumRows(*outGradTmp, 1, 1); - } - } - } -} - -} // namespace paddle. 
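For illustration, a minimal standalone sketch of the row-vector expansion implemented above (a hypothetical helper, not part of the layer; names are illustrative):

#include <vector>

// Replicate each input row of width `dim` into `numFilters` stacked copies,
// mirroring FeatureMapExpandLayer with asRowVector_ == true.
std::vector<float> expandAsFeatureMaps(const std::vector<float>& x,
                                       int batch, int dim, int numFilters) {
  std::vector<float> y(static_cast<size_t>(batch) * numFilters * dim);
  for (int i = 0; i < batch; ++i) {
    for (int f = 0; f < numFilters; ++f) {
      for (int j = 0; j < dim; ++j) {
        y[(static_cast<size_t>(i) * numFilters + f) * dim + j] =
            x[static_cast<size_t>(i) * dim + j];
      }
    }
  }
  return y;
}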
diff --git a/paddle/gserver/layers/FullyConnectedLayer.cpp b/paddle/gserver/layers/FullyConnectedLayer.cpp deleted file mode 100644 index 21ffa01d95a460b4b6edc2b02d63c19b32d0b070..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FullyConnectedLayer.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "FullyConnectedLayer.h" -#include -#include -#include "paddle/math/SparseMatrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(fc, FullyConnectedLayer); - -bool FullyConnectedLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - /* initialize the weightList */ - CHECK(inputLayers_.size() == parameters_.size()); - for (size_t i = 0; i < inputLayers_.size(); i++) { - // Option the parameters - size_t height = inputLayers_[i]->getSize(); - size_t width = getSize(); - - // create a new weight - if (parameters_[i]->isSparse()) { - CHECK_LE(parameters_[i]->getSize(), width * height); - } else { - CHECK_EQ(parameters_[i]->getSize(), width * height); - } - Weight* w = new Weight(height, width, parameters_[i]); - - // append the new weight to the list - weights_.emplace_back(w); - } - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - return true; -} - -void FullyConnectedLayer::prefetch() { - for (size_t i = 0; i != inputLayers_.size(); ++i) { - auto* sparseParam = - dynamic_cast(weights_[i]->getW().get()); - if (sparseParam) { - MatrixPtr input = getInputValue(i); - sparseParam->addRows(input); - } - } -} - -void FullyConnectedLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInput(0).getBatchSize(); - int size = getSize(); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, size); - } - - MatrixPtr outV = getOutputValue(); - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - auto input = getInput(i); - CHECK(input.value) << "The input of 'fc' layer must be matrix"; - REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); - i == 0 ? 
outV->mul(*input.value, *weights_[i]->getW(), 1, 0) - : outV->mul(*input.value, *weights_[i]->getW(), 1, 1); - } - - /* add the bias-vector */ - if (biases_.get() != NULL) { - REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); - outV->addBias(*(biases_->getW()), 1); - } - - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void FullyConnectedLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - - if (biases_ && biases_->getWGrad()) { - REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - /* Increasing the number of gradient */ - biases_->getParameterPtr()->incUpdate(callback); - } - - bool syncFlag = hl_get_sync_flag(); - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - /* Calculate the W-gradient for the current layer */ - if (weights_[i]->getWGrad()) { - MatrixPtr input_T = getInputValue(i)->getTranspose(); - MatrixPtr oGrad = getOutputGrad(); - { - REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); - weights_[i]->getWGrad()->mul(*input_T, *oGrad, 1, 1); - } - } - - // If callback does not change value, backprop error asynchronously so that - // we can do the callback concurrently. - hl_set_sync_flag(false); - - /* Calculate the input layers error */ - MatrixPtr preGrad = getInputGrad(i); - if (NULL != preGrad) { - MatrixPtr weights_T = weights_[i]->getW()->getTranspose(); - REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); - preGrad->mul(*getOutputGrad(), *weights_T, 1, 1); - } - - hl_set_sync_flag(syncFlag); - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - weights_[i]->getParameterPtr()->incUpdate(callback); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h deleted file mode 100644 index e0f9d6ce55fbdf73e5507032c108c735bf04597b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FullyConnectedLayer.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { -/** - * A layer has full connections to all neurons in the previous layer. - * It computes an inner product with a set of learned weights, and - * (optionally) adds biases. - * - * The config file api is fc_layer. 
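- * - * In summary, forward() above computes - * \f[ - * y = f\left(\sum_i x_i W_i + b\right) - * \f] - * where \f$x_i\f$ is the value of the i-th input layer, \f$W_i\f$ the - * corresponding weight matrix, \f$b\f$ the optional bias, and \f$f\f$ the - * configured activation.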
- */ - -class FullyConnectedLayer : public Layer { - protected: - WeightList weights_; - std::unique_ptr<Weight> biases_; - - public: - explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {} - ~FullyConnectedLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - Weight& getWeight(int idx) { return *weights_[idx]; } - - void prefetch() override; - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h deleted file mode 100644 index 46508dc977bf1a6fd33dc1fb024bd1aed36a0ff3..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/GatedRecurrentLayer.h +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "GruCompute.h" -#include "Layer.h" -#include "SequenceToBatch.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief Please refer to "Junyoung Chung, Empirical Evaluation - * of Gated Recurrent Neural Networks on Sequence Modeling". - * - * GatedRecurrentLayer takes 1 input layer with size * 3. - * The input layer is divided into 3 equal parts: (xz_t, xr_t, xi_t). - * parameter and biasParameter are also divided into 3 equal parts: - * - parameter consists of (U_z, U_r, U) - * - biasParameter consists of (bias_z, bias_r, bias_o) - * - * \f[ - * update \ gate: z_t = actGate(xz_t + U_z * h_{t-1} + bias_z) \\ - * reset \ gate: r_t = actGate(xr_t + U_r * h_{t-1} + bias_r) \\ - * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, h_{t-1}) + bias_o) \\ - * hidden \ activation: h_t = dot((1-z_t), h_{t-1}) + dot(z_t, {h}_t) \\ - * \f] - * - * @note - * - dot denotes "element-wise multiplication". - * - actNode is defined by config active_type - * - actGate is defined by config active_gate_type - * - * The config file is grumemory.
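- * - * Shape note (derived from the three-way split above): for layer size - * \f$h\f$, the input has width \f$3h\f$, each of \f$U_z, U_r, U\f$ is an - * \f$h \times h\f$ matrix, and each bias part has width \f$h\f$.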
- */ - -class GatedRecurrentLayer : public Layer, public GruCompute { - public: - explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback) override; - - void resetState() override; - - void setState(LayerStatePtr state) override; - - LayerStatePtr getState() override; - - protected: - void forwardSequence(int batchSize, - size_t numSequences, - const int* starts, - MatrixPtr inputValue); - void backwardSequence(int batchSize, - size_t numSequences, - const int* starts, - MatrixPtr inputGrad); - - void forwardBatch(int batchSize, - size_t numSequences, - const int* starts, - MatrixPtr inputValue); - void backwardBatch(int batchSize, MatrixPtr inputGrad); - - protected: - std::unique_ptr<Weight> weight_; - std::unique_ptr<Weight> gateWeight_; - std::unique_ptr<Weight> stateWeight_; - std::unique_ptr<Weight> bias_; - - Argument gate_; - Argument resetOutput_; - - bool reversed_; - bool useBatch_; - std::unique_ptr<SequenceToBatch> batchValue_; - std::unique_ptr<SequenceToBatch> batchGrad_; - std::unique_ptr<ActivationFunction> activationGate_; - - MatrixPtr prevOutput_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/GruCompute.cpp b/paddle/gserver/layers/GruCompute.cpp deleted file mode 100644 index 48ddbc413e6c915be6e86704f96e919932ca2970..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/GruCompute.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "GruCompute.h" -#include "hl_recurrent_apply.cuh" -#include "paddle/function/GruFunctor.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -void GruCompute::init(LayerConfig &config) { - activeNode_ = hlActiveType(config.active_type()); - activeGate_ = hlActiveType(config.active_gate_type()); -} - -template <> -void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) { - GruFunctor<DEVICE_TYPE_CPU, real>::compute(hppl::forward::gru_resetOutput(), - hppl::forward::gru_finalOutput(), - value, - frameSize, - batchSize, - activeNode_, - activeGate_); -} - -template <> -void GruCompute::backward<0>(hl_gru_value value, - hl_gru_grad grad, - int frameSize, - int batchSize) { - GruGradFunctor<DEVICE_TYPE_CPU, real>::compute( - hppl::backward::gru_stateGrad(), - hppl::backward::gru_resetGrad(), - value, - grad, - frameSize, - batchSize, - activeNode_, - activeGate_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp deleted file mode 100644 index 509c07cf22c9bcbe9283241b38540162b3dbe26b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/InterpolationLayer.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for linear interpolation with two inputs, - * which is used in NEURAL TURING MACHINE. - * \f[ - * y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i] - * \f] - * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs, - * \f$w\f$ is (batchSize x 1) weight vector, - * and \f$y\f$ is (batchSize x dataDim) output. - * - * The config file api is interpolation_layer. - */ - -class InterpolationLayer : public Layer { - protected: - /// weightLast = 1 - weight - MatrixPtr weightLast_; - MatrixPtr tmpMatrix; - - public: - explicit InterpolationLayer(const LayerConfig& config) : Layer(config) {} - - ~InterpolationLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(interpolation, InterpolationLayer); - -bool InterpolationLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(3U, inputLayers_.size()); - - return true; -} - -void InterpolationLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr weightV = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inV2 = getInputValue(2); - - size_t batchSize = inV1->getHeight(); - size_t dataDim = inV1->getWidth(); - - CHECK_EQ(dataDim, getSize()); - CHECK_EQ(dataDim, inV2->getWidth()); - CHECK_EQ(batchSize, inV1->getHeight()); - CHECK_EQ(batchSize, inV2->getHeight()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - resetOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - - Matrix::resizeOrCreate(weightLast_, batchSize, 1, false, useGpu_); - weightLast_->one(); - weightLast_->sub(*weightV); - - REGISTER_TIMER_INFO("FwInterpTimer", getName().c_str()); - // outV = inV1 * weight + inV2 * weightLast - outV->addRowScale(0, *inV1, *weightV); - outV->addRowScale(0, *inV2, *weightLast_); -} - -void InterpolationLayer::backward(const UpdateCallback& callback) { - MatrixPtr outG = getOutputGrad(); - MatrixPtr weightV = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inV2 = getInputValue(2); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - MatrixPtr inG2 = getInputGrad(2); - - size_t batchSize = inV1->getHeight(); - size_t dataDim = inV1->getWidth(); - - REGISTER_TIMER_INFO("BwInterpTimer", getName().c_str()); - - if (inG0) { - Matrix::resizeOrCreate(tmpMatrix, batchSize, dataDim, false, useGpu_); - - // inG0 += outG .* (inV1 - inV2) - tmpMatrix->sub(*inV1, *inV2); - inG0->rowDotMul(0, *outG, *tmpMatrix); - } - - if (inG1) { - // inG1 += outG * weight - inG1->addRowScale(0, *outG, *weightV); - } - - if (inG2) { - // inG2 += outG * weightLast - inG2->addRowScale(0, *outG, *weightLast_); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/L2DistanceLayer.h 
b/paddle/gserver/layers/L2DistanceLayer.h deleted file mode 100644 index 44e688e1377145845033d9d5cc3f31f5594a11f6..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/L2DistanceLayer.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief The layer calculates the l2 distance between two input vectors. - * \f[ - * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)^2} - * \f] - * - * - Input1: A vector (batchSize * dataDim) - * - Input2: A vector (batchSize * dataDim) - * - Output: A vector (batchSize * 1) - * - * The configuration api is: l2_distance_layer. - */ - -class L2DistanceLayer : public Layer { - public: - explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {} - ~L2DistanceLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - private: - // Store the result of subtracting Input2 from Input1 in forward computation, - // which will be reused in backward computation. - MatrixPtr inputSub_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp deleted file mode 100644 index 32e2f4c9dd06e0ef7314b24719235c0be297961f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/Layer.cpp +++ /dev/null @@ -1,410 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
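The L2 distance layer above emits one scalar per row: the Euclidean distance between corresponding rows of its two inputs (note the squared difference under the root, corrected in the comment above). A minimal sketch of the row-wise computation; l2Distance and its buffer layout are illustrative, not the layer's Matrix-based forward:

#include <cmath>
#include <cstdio>
#include <vector>

// Row-wise Euclidean distance: out[b] = sqrt(sum_i (x[b][i] - y[b][i])^2).
std::vector<float> l2Distance(const std::vector<float>& x,
                              const std::vector<float>& y,
                              int batch, int dim) {
  std::vector<float> out(batch);
  for (int b = 0; b < batch; ++b) {
    float sum = 0.f;
    for (int i = 0; i < dim; ++i) {
      // x - y is what the layer caches in inputSub_ for the backward pass.
      float d = x[b * dim + i] - y[b * dim + i];
      sum += d * d;
    }
    out[b] = std::sqrt(sum);
  }
  return out;
}

int main() {
  // distance((0,0), (3,4)) = 5.
  std::printf("%f\n", l2Distance({0.f, 0.f}, {3.f, 4.f}, 1, 2)[0]);
  return 0;
}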
*/ - -#include "paddle/utils/Util.h" - -#include "CostLayer.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/utils/Error.h" -#include "paddle/utils/Logging.h" - -#ifndef PADDLE_MOBILE_INFERENCE -#include "ValidationLayer.h" -#endif - -DEFINE_bool(log_error_clipping, false, "enable log error clipping or not"); - -namespace paddle { - -Layer::Layer(const LayerConfig& config, bool useGpu) - : config_(config), - useGpu_(useGpu), - deviceId_(CPU_DEVICE), - needSequenceInfo_(true) {} - -bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - if (useGpu_ && FLAGS_parallel_nn) { - /* gpu environment is specified by device property */ - deviceId_ = config_.device(); - if (deviceId_ < 0) { - useGpu_ = false; - } - } - - output_.deviceId = deviceId_; - - for (auto& inputConfig : config_.inputs()) { - std::string inputName = inputConfig.input_layer_name(); - LayerPtr inputLayer; - CHECK(mapGet(inputName, layerMap, &inputLayer)) - << "Cannot find input layer " << inputName << " for layer " - << getName(); - this->addPrev(inputLayer); - - inputLayer->addOutputArgument(deviceId_); - - if (inputConfig.has_input_parameter_name()) { - ParameterPtr parameter; - CHECK( - mapGet(inputConfig.input_parameter_name(), parameterMap, ¶meter)) - << "Cannot find input parameter " - << inputConfig.input_parameter_name() << " for layer " << getName(); - parameter->incShared(); - CHECK_EQ(parameter->getDeviceId(), getDeviceId()); - parameters_.push_back(parameter); - } else { - parameters_.push_back(nullptr); - } - - if (inputConfig.has_input_layer_argument()) { - inputArgument_.push_back(inputConfig.input_layer_argument()); - } else { - inputArgument_.push_back(""); - } - } - - if (config_.has_bias_parameter_name()) { - CHECK(mapGet(config_.bias_parameter_name(), parameterMap, &biasParameter_)) - << "Cannot find bias parameter " << config_.bias_parameter_name() - << " for layer " << getName(); - biasParameter_->incShared(); - CHECK_EQ(biasParameter_->getDeviceId(), getDeviceId()); - } - - /* specify the activation function according to the configuration */ - std::string action_type = config_.active_type(); - activation_.reset(ActivationFunction::create(action_type)); - CHECK(activation_); - - initNeedFlags(); - markInBackward_.assign(inputLayers_.size(), false); - - return true; -} - -ClassRegistrar Layer::registrar_; - -LayerPtr Layer::create(const LayerConfig& config) { - std::string type = config.type(); - -#ifndef PADDLE_MOBILE_INFERENCE - // NOTE: As following types have illegal character '-', - // they can not use REGISTER_LAYER to registrar. - // Besides, to fit with old training models, - // they can not use '_' instead. 
- if (type == "multi-class-cross-entropy") - return LayerPtr(new MultiClassCrossEntropy(config)); - else if (type == "rank-cost") - return LayerPtr(new RankingCost(config)); - else if (type == "auc-validation") - return LayerPtr(new AucValidation(config)); - else if (type == "pnpair-validation") - return LayerPtr(new PnpairValidation(config)); -#endif - - return LayerPtr(registrar_.createByType(config.type(), config)); -} - -void Layer::resetSpecifyOutput(Argument& output, - size_t height, - size_t width, - bool isValueClean, - bool isGradClean) { - SetDevice device(output.deviceId); - - Matrix::resizeOrCreate( - output.value, height, width, /* trans */ false, useGpu(output.deviceId)); - if (isValueClean) { - output.value->zeroMem(); - } - - if (passType_ != PASS_TEST && needGradient()) { - Matrix::resizeOrCreate( - output.grad, height, width, /* trans */ false, useGpu(output.deviceId)); - if (isGradClean) { - output.grad->zeroMem(); - } - } -} - -void Layer::resizeOutput(size_t height, size_t width) { - resetSpecifyOutput(output_, height, width, false, false); - - for (size_t i = 0; i != outputOtherDevice_.size(); i++) { - resetSpecifyOutput(outputOtherDevice_[i], height, width, false, false); - } -} - -void Layer::reserveOutput(size_t height, size_t width) { - resetSpecifyOutput(output_, height, width, false, true); - - for (size_t i = 0; i != outputOtherDevice_.size(); i++) { - resetSpecifyOutput(outputOtherDevice_[i], height, width, false, true); - } -} - -void Layer::resetOutput(size_t height, size_t width) { - resetSpecifyOutput(output_, height, width, true, true); - - for (size_t i = 0; i != outputOtherDevice_.size(); i++) { - resetSpecifyOutput(outputOtherDevice_[i], height, width, true, true); - } -} - -void Layer::addOutputArgument(int deviceId) { - if (deviceId == deviceId_) { - output_.countIncrement(); - return; - } else { - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - if (outputOtherDevice_[i].deviceId == deviceId) { - outputOtherDevice_[i].countIncrement(); - return; - } - } - } - - Argument argu; - argu.deviceId = deviceId; - outputOtherDevice_.push_back(argu); - outputOtherDevice_.back().countIncrement(); -} - -void Layer::copyOutputToOtherDevice() { - for (size_t i = 0; i != outputOtherDevice_.size(); i++) { - SetDevice device(outputOtherDevice_[i].deviceId); - // If outputOtherDevice_[i].value is a CpuMatrix, - // the copyFrom is a synchronous interface. - // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent - // calculations are all on HPPL_STREAM_DEFAULT, - // copyFrom can be an asynchronous interface. 
- outputOtherDevice_[i].value->copyFrom(*getOutputValue(), - HPPL_STREAM_DEFAULT); - outputOtherDevice_[i].sequenceStartPositions = - output_.sequenceStartPositions; - outputOtherDevice_[i].subSequenceStartPositions = - output_.subSequenceStartPositions; - outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims; - - outputOtherDevice_[i].notifyValueReady(); - } -} - -void Layer::waitInputValue() { - for (size_t i = 0; i != inputLayers_.size(); i++) { - if (inputLayers_[i]->getDeviceId() != deviceId_) { - getInput(i).waitValueReady(); - } - } -} - -void Layer::waitAndMergeOutputGrad() { - if (!output_.grad || !outputOtherDevice_.size()) { - return; - } - - for (size_t i = 0; i != outputOtherDevice_.size(); i++) { - outputOtherDevice_[i].waitGradReady(); - } - - /* merge output grad */ - size_t i = 0; - if (!output_.getAllCount()) { - output_.grad->copyFrom(*outputOtherDevice_[0].grad, HPPL_STREAM_1); - hl_stream_synchronize(HPPL_STREAM_1); - - i++; - if (outputOtherDevice_.size() == 1) return; - } - - Matrix::resizeOrCreate(tmpGrad_, - output_.grad->getHeight(), - output_.grad->getWidth(), - /* trans */ false, - useGpu(output_.deviceId)); - - for (; i != outputOtherDevice_.size(); i++) { - tmpGrad_->copyFrom(*outputOtherDevice_[i].grad, HPPL_STREAM_1); - hl_stream_synchronize(HPPL_STREAM_1); - output_.grad->add(*tmpGrad_); - } -} - -void Layer::markAllInputGrad() { - for (size_t i = 0; i != inputLayers_.size(); ++i) { - if (!markInBackward_[i]) { - inputLayers_[i]->getOutput(deviceId_).notifyGradReady(); - } - markInBackward_[i] = false; - } -} - -void Layer::markInputGrad(int inputIndex) { - inputLayers_[inputIndex]->getOutput(deviceId_).notifyGradReady(); - markInBackward_[inputIndex] = true; -} - -void Layer::zeroGrad() { - CHECK(output_.grad.get() != NULL); - output_.grad->zeroMem(); -} - -void Layer::initNeedFlags() { - auto initFlag = [this]( - bool& flag, bool (Layer::*flagQueryFunc)() const, ParameterType type) { - flag = false; - if (biasParameter_ && biasParameter_->hasType(type)) { - flag = true; - } - if (!flag) { - for (auto& para : parameters_) { - if (para && para->hasType(type)) { - flag = true; - break; - } - } - } - if (!flag) { - for (auto& layer : inputLayers_) { - if ((layer.get()->*flagQueryFunc)()) { - flag = true; - } - } - } - }; - initFlag(needGradient_, &Layer::needGradient, PARAMETER_GRADIENT); -} - -void Layer::showOutputStats() { - MatrixPtr out = getOutputValue(); - if (!out) return; - if (!out->getElementCnt()) { - LOG(INFO) << "The number of output of " << config_.name() - << " is 0, skip showing the statistics"; - return; - } - MatrixPtr outSquare; - if (dynamic_cast<GpuSparseMatrix*>(out.get())) { - GpuSparseMatrix* tmp = dynamic_cast<GpuSparseMatrix*>(out.get()); - outSquare = std::make_shared<CpuSparseMatrix>(tmp->getHeight(), - tmp->getWidth(), - tmp->getElementCnt(), - tmp->getValueType(), - tmp->getFormat()); - } else { - outSquare = out->clone(); - } - outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - - real mean = outSquare->getSum() / out->getElementCnt(); - real min; - real max; - if (dynamic_cast<CpuSparseMatrix*>(outSquare.get())) { - auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get()); - min = tmpMat->getMin(); - max = tmpMat->getMax(); - tmpMat->square2(); - LOG(INFO) << "show statistics of [non-zero values] in sparse matrix"; - } else { - min = outSquare->getMin(); - max = outSquare->getMax(); - outSquare->square2(); - } - real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean; - std = std > 0 ?
std : 0; - LOG(INFO) << "The output state of " << config_.name() << ": mean=" << mean - << ", " - << "std=" << std << ", " - << "min=" << min << ", " - << "max=" << max; -} - -void Layer::forwardActivation() { - /* activation */ - auto status = activation_->forward(output_); - status.check(); - - /* dropout */ - if (config_.drop_rate() > 0) { - forwardDropOut(); - CHECK_NE(activation_->getName(), "softmax") - << "Softmax activation cannot be used with Dropout"; - } - - if (FLAGS_show_layer_stat) { - showOutputStats(); - } -} - -void Layer::backwardActivation() { - /* Do error clipping */ - if (config_.error_clipping_threshold() > 0.0f) { - if (FLAGS_log_error_clipping) { - VectorPtr outGradVec = Vector::create( - output_.grad->getData(), output_.grad->getElementCnt(), useGpu_); - real maxAbsGrad = outGradVec->getAbsMax(); - if (maxAbsGrad > config_.error_clipping_threshold()) { - real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize(); - LOG(INFO) << " layer=" << config_.name() << " need clipping," - << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad; - } - } - output_.grad->clip(-config_.error_clipping_threshold(), - config_.error_clipping_threshold()); - } - - /* Do dropout for delta*/ - if (config_.drop_rate() > 0 && passType_ != PASS_TEST) { - MatrixPtr oGrad = getOutputGrad(); - oGrad->dotMul(*oGrad, *dropOutMask_); - } - - auto status = activation_->backward(output_); - status.check(); -} - -void Layer::forwardDropOut() { - auto& outV = getOutputValue(); - - if (passType_ == PASS_TRAIN) { - // new dropOutMask_ if dropOutMask_ is null ptr - Matrix::resizeOrCreate(dropOutMask_, - outV->getHeight(), - outV->getWidth(), - false, - useGpu(deviceId_)); - dropOutMask_->randomizeUniform(); // generate a uniform random matrix - dropOutMask_->biggerThanScalar(config_.drop_rate()); // random mask - outV->dotMul(*outV, *dropOutMask_); // dropout - } else if (passType_ == PASS_GC) { - // only initialize once - if (!dropOutMask_) { - dropOutMask_ = Matrix::create( - outV->getHeight(), outV->getWidth(), false, useGpu(deviceId_)); - // We use cpu matrix to generate mask so that the mask - // will be same for both gpu version and cpu version. - // This will help unittest to make sure they have same result. - MatrixPtr tmpMask = Matrix::create(outV->getHeight(), outV->getWidth()); - tmpMask->randomizeUniform(); // generate a uniform random matrix - tmpMask->biggerThanScalar(config_.drop_rate()); // random mask - dropOutMask_->copyFrom(*tmpMask); - } - outV->dotMul(*outV, *dropOutMask_); - } else { // passType == PASS_TEST - outV->mulScalar(1.0 - config_.drop_rate()); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h deleted file mode 100644 index 13e20e8316323f9082a9615041584685853aa395..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/Layer.h +++ /dev/null @@ -1,512 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
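The forwardDropOut() routine deleted a little above implements classic, non-inverted dropout: during training each unit is zeroed with probability drop_rate via a random 0-1 mask, and at test time activations are scaled by (1 - drop_rate) instead so expectations match. A minimal sketch of that behavior, with an explicit RNG in place of the layer's mask matrices (dropoutForward is an illustrative name):

#include <cstdio>
#include <random>
#include <vector>

// Non-inverted dropout, as in the deleted forwardDropOut():
// train: zero each unit with probability dropRate; test: scale by (1 - dropRate).
void dropoutForward(std::vector<float>& acts, float dropRate, bool isTrain,
                    std::mt19937& rng) {
  if (isTrain) {
    std::uniform_real_distribution<float> uniform(0.f, 1.f);
    for (float& a : acts) {
      // Keep the unit iff a uniform draw exceeds dropRate
      // (the role of biggerThanScalar() on the random mask above).
      a = (uniform(rng) > dropRate) ? a : 0.f;
    }
  } else {
    for (float& a : acts) a *= (1.f - dropRate);  // match the training-time scale
  }
}

int main() {
  std::mt19937 rng(0);
  std::vector<float> acts = {1.f, 1.f, 1.f, 1.f};
  dropoutForward(acts, 0.5f, /*isTrain=*/true, rng);
  for (float a : acts) std::printf("%g ", a);
  std::printf("\n");
  return 0;
}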
 */ - -#pragma once - -#include <functional> -#include <memory> -#include "ModelConfig.pb.h" -#include "paddle/function/Function.h" -#include "paddle/gserver/activations/ActivationFunction.h" -#include "paddle/math/CpuSparseMatrix.h" -#include "paddle/parameter/Argument.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/Weight.h" -#include "paddle/utils/ClassRegistrar.h" -#include "paddle/utils/Util.h" - -/// Macro for registering a layer type. -/// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer); -#define REGISTER_LAYER(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name( \ - []() { Layer::registrar_.registerClass<__class_name>(#__type_name); }) - -#define REGISTER_LAYER_CREATE_FUNC(__type_name, createFunction) \ - static InitFunction __reg_type_##__type_name( \ - []() { Layer::registrar_.registerClass(#__type_name, createFunction); }) - -namespace paddle { - -class Layer; -typedef std::shared_ptr<Layer> LayerPtr; -typedef std::map<std::string, LayerPtr> LayerMap; -class NeuralNetwork; - -/// layer state, used for RNN and LSTM layers -struct LayerState { - std::vector<MatrixPtr> value; -}; -typedef std::shared_ptr<LayerState> LayerStatePtr; - -/// Paddle device ID, MKLDNN is -2, CPU is -1 -enum PADDLE_DEVICE_ID { - MKLDNN_DEVICE = -2, - CPU_DEVICE = -1, -}; - -/** - * @brief Base class for layer. - * Define necessary variables and functions for every layer. - */ -class Layer { - protected: - /// Layer config - LayerConfig config_; - /// whether to use GPU - bool useGpu_; - /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ... - int deviceId_; - /// Input layers - std::vector<LayerPtr> inputLayers_; - /// Argument of input layers - std::vector<std::string> inputArgument_; - - /// Parameter for each input layer. - /// Parameters_[i] is nullptr if inputLayers_[i] does not need parameter. - std::vector<ParameterPtr> parameters_; - - /// nullptr if bias is not needed. - ParameterPtr biasParameter_; - - /// Output - Argument output_; - /// Several outputs stored on different devices, used in 'parallel_nn' case, - /// and record them by deviceId_. - /// Also used in 'use_mkldnn' case. - std::vector<Argument> outputOtherDevice_; - /// If there are several outputs, map them by each name. - /// MKLDNNLayer uses it only to merge output grad - std::map<std::string, Argument*> outputMap_; - /// Used to merge grad on different devices. - MatrixPtr tmpGrad_; - - std::unique_ptr<ActivationFunction> activation_; - - /// Current passType, PASS_TRAIN or PASS_TEST - PassType passType_; - - /// Random 0-1 matrix for dropOut - MatrixPtr dropOutMask_; - - /// Whether the layer needs to compute gradient - bool needGradient_; - /// Whether the layer needs to compute re-sequence information - bool needSequenceInfo_; - - /// Whether each input grad is marked in (true) or out (false) of the backward function. - std::vector<bool> markInBackward_; - - /// Layer forward function - std::vector<std::shared_ptr<FunctionBase>> forward_; - /// Layer backward function - std::vector<std::shared_ptr<FunctionBase>> backward_; - - public: - /** - * Wait until all input values are ready. - * Called before Layer::forward() function. - */ - virtual void waitInputValue(); - - /** - * Copy layer's output_ to other device. - * If the output layer is on another device, called after Layer::forward() function. - */ - virtual void copyOutputToOtherDevice(); - - /** - * Wait until all output grads are ready and merge them into output_.grad. - * Called before Layer::backward() function. - */ - virtual void waitAndMergeOutputGrad(); - - /** - * Notify the previous layer that the output grad is ready. - * Called after Layer::backward() function. - */ - virtual void markAllInputGrad(); - - protected: - /** - * Create layer function.
Function is called in forward or backward. - * \param function, Layer::forward_ or Layer::backward_ - * \param name, function name - * \param config, initialization configuration for the function - */ - void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function, - const std::string& name, - const FuncConfig& config) { - if (useGpu_) { - function.emplace_back( - FunctionBase::funcRegistrar_.createByType(name + "-GPU")); - } else { - function.emplace_back( - FunctionBase::funcRegistrar_.createByType(name + "-CPU")); - } - auto& func = function.back(); - func->init(config); - } - - /** - * Notify the specified layer that the output grad is ready. - * Called in the backward function. - * If you mark input grad in the backward function, you should ensure - * that all input grads will be marked in the backward function. - */ - void markInputGrad(int inputIndex); - - /** - * Get the argument of input layer. - */ - const Argument& getInput(size_t inputIndex) const { - return inputLayers_[inputIndex]->getOutput(deviceId_); - } - - /** - * Get the argument of input layer. - */ - const Argument& getInput(const Layer& inputLayer) const { - return inputLayer.getOutput(deviceId_); - } - - /** - * Get the argument of input layer with deviceId. - */ - const Argument& getInput(size_t inputIndex, int deviceId) const { - return inputLayers_[inputIndex]->getOutput(deviceId); - } - - /** - * Get the forward-input value. - */ - const MatrixPtr& getInputValue(int inputIndex) { - return inputLayers_[inputIndex]->getOutput(deviceId_).value; - } - - /** - * Get the forward-input value. - */ - const MatrixPtr& getInputValue(const Layer& inputLayer) { - return inputLayer.getOutput(deviceId_).value; - } - - /** - * Get the forward-input value with deviceId. - */ - const MatrixPtr& getInputValue(int inputIndex, int deviceId) { - return inputLayers_[inputIndex]->getOutput(deviceId).value; - } - - /** - * Get the forward-input grad. - */ - const MatrixPtr& getInputGrad(int inputIndex) { - return inputLayers_[inputIndex]->getOutput(deviceId_).grad; - } - - /** - * Get the forward-input grad. - */ - const MatrixPtr& getInputGrad(const Layer& inputLayer) { - return inputLayer.getOutput(deviceId_).grad; - } - - /** - * Get the forward-input grad. - */ - const MatrixPtr& getInputGrad(int inputIndex, int deviceId) { - return inputLayers_[inputIndex]->getOutput(deviceId).grad; - } - - /** - * Get the forward-input label. - */ - const IVectorPtr& getInputLabel(const Layer& inputLayer) { - return inputLayer.getOutput(deviceId_).ids; - } - - /** - * Change the size of output (value, grad). - * Reset to value zero if isValueClean = true, - * Reset to grad zero if isGradClean = true. - */ - void resetSpecifyOutput(Argument& output, - size_t height, - size_t width, - bool isValueClean, - bool isGradClean); - - /** - * Add output argument to other devices. - */ - void addOutputArgument(int deviceId); - - public: - explicit Layer(const LayerConfig& config, bool useGpu = FLAGS_use_gpu); - virtual ~Layer() {} - - /// Register a Layer - static ClassRegistrar<Layer, LayerConfig> registrar_; - - /** - * Get the flag whether layer need to compute gradient. - */ - bool needGradient() const { return needGradient_; } - - /** - * Set the flag whether layer need to compute gradient. - */ - void setNeedGradient(bool need) { needGradient_ = need; } - - /** - * Set the flag whether layer need to re-compute sequence information, - * which includes sequenceStartPositions or subSequenceStartPositions.
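createFunction() above resolves a device-specific kernel by name, appending "-GPU" or "-CPU" before asking the function registrar for an instance. Here is a toy sketch of that lookup pattern, with a plain std::map standing in for FunctionBase::funcRegistrar_; all names are illustrative:

#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

// A toy function registry keyed by "name-CPU" / "name-GPU", mimicking how
// createFunction() above picks a device-specific implementation.
struct Func {
  std::function<void()> run;
};

std::map<std::string, std::function<std::unique_ptr<Func>()>>& registry() {
  static std::map<std::string, std::function<std::unique_ptr<Func>()>> r;
  return r;
}

std::unique_ptr<Func> createByType(const std::string& name, bool useGpu) {
  const std::string key = name + (useGpu ? "-GPU" : "-CPU");
  auto it = registry().find(key);
  if (it == registry().end()) throw std::runtime_error("unknown function: " + key);
  return it->second();  // invoke the registered factory
}

int main() {
  registry()["Pad-CPU"] = []() -> std::unique_ptr<Func> {
    return std::unique_ptr<Func>(new Func{[] { std::puts("pad on CPU"); }});
  };
  createByType("Pad", /*useGpu=*/false)->run();
  return 0;
}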
- */ - void setNeedSequenceInfo(bool need) { needSequenceInfo_ = need; } - - /** - * Get layer's name. - */ - const std::string& getName() const { return config_.name(); } - - /** - * Get layer's type. - */ - const std::string& getType() const { return config_.type(); } - - /** - * Get layer's size. - */ - size_t getSize() const { return config_.size(); } - - /** - * Get layer's deviceId. - */ - int getDeviceId() const { return deviceId_; } - - /** - * Add the inputLayer. - */ - void addPrev(LayerPtr l) { inputLayers_.push_back(l); } - - /** - * Get inputLayer[i]. - */ - const LayerPtr& getPrev(size_t i) { return inputLayers_[i]; } - - /** - * Get the forward-output value. - */ - const MatrixPtr& getOutputValue() { return output_.value; } - - /** - * Get the forward-output label. - */ - const IVectorPtr& getOutputLabel() { return output_.ids; } - - /** - * Get the backward-Loss value. - */ - const MatrixPtr& getOutputGrad() { return output_.grad; } - /** - * If the layer has multiple outputs, set output into outputMap_. - */ - void setOutput(const std::string& name, Argument* output) { - outputMap_[name] = output; - } - - /** - * Get the output map size, if the layer has multiple outputs. - */ - size_t getOutputMapSize() { return outputMap_.size(); } - - /** - * Get the output based on layer's name. - */ - Argument& getOutput(const std::string& str = "") { - if (str == "") { - return output_; - } else { - auto output = outputMap_.find(str); - if (output != outputMap_.end()) { - return *output->second; - } else { - LOG(FATAL) << "No specific output " << str; - return *((Argument*)nullptr); - } - } - } - - /** - * Get the output based on deviceId. - */ - const Argument& getOutput(int deviceId) const { - if (deviceId == getDeviceId()) { - return output_; - } else { - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - if (outputOtherDevice_[i].deviceId == deviceId) { - return outputOtherDevice_[i]; - } - } - - LOG(FATAL) << "No specific device output "; - return *((Argument*)nullptr); - } - } - - /** - * Get layer's parameters. - */ - const std::vector<ParameterPtr>& getParameters() { return parameters_; } - - /** - * Get layer's bias-parameters. - */ - const ParameterPtr& getBiasParameter() { return biasParameter_; } - - /** - * Create pointer of layer. - */ - static LayerPtr create(const LayerConfig& config); - - /** - * Resize the output matrix size. - */ - void resizeOutput(size_t height, size_t width); - - /** - * Resize the output matrix size, - * and reset value to zero. - */ - void reserveOutput(size_t height, size_t width); - - /** - * Resize the output matrix size, - * and reset value and grad to zero. - */ - void resetOutput(size_t height, size_t width); - - /** - * Clear the gradient of output. - */ - void zeroGrad(); - - /** - * Initialization. - * For example, adding input layers from layerMap and parameterMap. - */ - virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - - /** - * Initialization for the sub-network, if there is one. - * @param rootNetwork root network - * @param config model config - * @param parameterTypes parameter's type - * @param useGpu whether to use gpu or not - */ - virtual void initSubNetwork(NeuralNetwork* rootNetwork, - const ModelConfig& config, - const std::vector<ParameterType>& parameterTypes, - bool useGpu) {} - - /** - * @brief Access SubNetwork Object. - * If a subnetwork exists, then invoke the callback with the subnetwork. - * @param callback if the sub-network exists, the callback is invoked.
- */ - virtual void accessSubNetwork( - const std::function<void(NeuralNetwork&)>& callback) {} - - /** - * If a sparse row matrix is used as the parameter, - * prefetch the feature ids in the input label. - */ - virtual void prefetch() {} - - /** - * Forward propagation. - * All inherited implementations should call the Layer::forward() function. - */ - virtual void forward(PassType passType) { - passType_ = passType; - if (!inputLayers_.empty() && needSequenceInfo_) { - const Argument& input = getInput(0); - output_.sequenceStartPositions = input.sequenceStartPositions; - output_.subSequenceStartPositions = input.subSequenceStartPositions; - output_.cpuSequenceDims = input.cpuSequenceDims; - } - } - - /** - * Reset the internal state variables. - * Allocate them if they have not been allocated. - * This function needs to be called before Layer::forward() when generating - * a sequence. - * - * This is used for sequence generation. When generating sequence, the - * calculation at current timestamp depends on the state from previous - * timestamp. The model needs to keep the information about the previous - * timestamp in the state variables. Layers such as RecurrentLayer, - * LstmLayer and ContextLayer have state variables. - */ - virtual void resetState() {} - - /** - * Set layer state. - */ - virtual void setState(LayerStatePtr state) {} - - /** - * Get layer state. - * @return A copy of internal state. - */ - virtual LayerStatePtr getState() { return nullptr; } - - /** - * Show output state. - */ - void showOutputStats(); - - /** - * Backward propagation. - * Should only be called after Layer::forward() function. - */ - virtual void backward(const UpdateCallback& callback = nullptr) = 0; - - /** - * One pass is finished. - */ - virtual void onPassEnd() {} - - protected: - /** - * Forward of activation function. - */ - void forwardActivation(); - /** - * Backward of activation function. - */ - void backwardActivation(); - /** - * Forward of dropOut. - */ - void forwardDropOut(); - /** - * Initialize the needGradient_ flag. - */ - void initNeedFlags(); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h deleted file mode 100644 index e802b701d0237bed44adc83273fe53c3e18c92ec..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LinearChainCRF.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/math/Matrix.h" - -namespace paddle { - -class LinearChainCRF { - public: - /** - * The size of para must be \f$(numClasses + 2) * numClasses\f$. - * The first numClasses values of para are for starting weights (\f$a\f$). - * The next numClasses values of para are for ending weights (\f$b\f$). - * The remaining values are for transition weights (\f$w\f$).
- * - * The probability of a state sequence s of length \f$L\f$ is defined as: - * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L} - * + \sum_{l=1}^L x_{s_l} - * + \sum_{l=2}^L w_{s_{l-1},s_l})\f$ - * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over - * all possible - * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF. - */ - LinearChainCRF(int numClasses, real* para); - - /** - * Calculate the negative log likelihood of s given x. - * The size of x must be length * numClasses. Each consecutive numClasses - * values are the features for one time step. - */ - real forward(real* x, int* s, int length); - - /** - * Calculate the gradient with respect to x, a, b, and w. - * backward() can only be called after a corresponding call to forward() with - * the same x, s and length. - * The gradient with respect to a, b, and w will not be calculated if - * needWGrad is false. - * @note Please call getWGrad() and getXGrad() to get the gradient with - * respect to (a, b, w) and x respectively. - */ - void backward(real* x, int* s, int length, bool needWGrad); - - /** - * Find the most probable sequence given x. The result will be stored in s. - */ - void decode(real* x, int* s, int length); - - /* - * Return the gradient with respect to (a, b, w). It can only be called after - * a corresponding call to backward(). - */ - MatrixPtr getWGrad() { return matWGrad_; } - - /* - * Return the gradient with respect to x. It can only be called after a - * corresponding call to backward(). - */ - MatrixPtr getXGrad() { return matGrad_; } - - protected: - int numClasses_; - MatrixPtr a_; - MatrixPtr b_; - MatrixPtr w_; - MatrixPtr matWGrad_; - MatrixPtr da_; - MatrixPtr db_; - MatrixPtr dw_; - MatrixPtr ones_; - - MatrixPtr expX_; - MatrixPtr matGrad_; - MatrixPtr alpha_; - MatrixPtr beta_; - MatrixPtr maxX_; - MatrixPtr expW_; - - // track_(k,i) = j means that the best sequence at time k for class i comes - // from the sequence at time k-1 for class j - IVectorPtr track_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/LinearChainCTC.h b/paddle/gserver/layers/LinearChainCTC.h deleted file mode 100644 index 5b325a0deb0e9d8df241175159321e52f527f6c4..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LinearChainCTC.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
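Under the parameter layout just described (numClasses starting weights a, numClasses ending weights b, then the numClasses x numClasses transition matrix w), the unnormalized log score of a tag sequence is a_{s_1} + b_{s_L} + sum_l x_l[s_l] + sum_l w[s_{l-1}][s_l]; forward() then returns the negative log likelihood by subtracting log Z. A minimal sketch of just the path-score term (pathScore is an illustrative name; the normalizer Z computed by the real forward() is omitted):

#include <cstdio>
#include <vector>

// Unnormalized log score of tag path s under the (a, b, w, x) layout
// described above: para = [a (C values), b (C values), w (C x C values)].
float pathScore(const std::vector<float>& para,  // (C + 2) * C values
                const std::vector<float>& x,     // length L x C class features
                const std::vector<int>& s, int numClasses) {
  const float* a = para.data();                   // starting weights
  const float* b = para.data() + numClasses;      // ending weights
  const float* w = para.data() + 2 * numClasses;  // transitions, row-major
  int L = (int)s.size();
  float score = a[s[0]] + b[s[L - 1]];
  for (int l = 0; l < L; ++l) score += x[l * numClasses + s[l]];
  for (int l = 1; l < L; ++l) score += w[s[l - 1] * numClasses + s[l]];
  return score;
}

int main() {
  // Two classes, path (0, 1): a[0] + b[1] + x0[0] + x1[1] + w[0][1] = 2.7.
  std::vector<float> para = {0.1f, 0.2f,  // a
                             0.3f, 0.4f,  // b
                             0.f, 1.f,    // w row 0
                             2.f, 3.f};   // w row 1
  std::vector<float> x = {0.5f, 0.f, 0.f, 0.7f};
  std::printf("%f\n", pathScore(para, x, {0, 1}, 2));
  return 0;
}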
*/ - -#pragma once - -#include -#include "paddle/math/Matrix.h" - -namespace paddle { - -class LinearChainCTC { - public: - LinearChainCTC(int numClasses, bool normByTimes); - - // Calculate the negative log probability as loss - real forward(real* softmaxSeq, - int softmaxSeqLen, - int* labelSeq, - int labelSeqLen); - - // calculate the gradient - void backward(real* softmaxSeq, - real* softmaxSeqGrad, - int* labelSeq, - int labelSeqLen); - - protected: - int numClasses_, blank_, totalSegments_, totalTime_; - bool normByTimes_; - bool isInvalid_; - - MatrixPtr logActs_, forwardVars_, backwardVars_, gradTerms_; - - real logProb_; - - void segmentRange(int& start, int& end, int time); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/LstmLayer.cpp b/paddle/gserver/layers/LstmLayer.cpp deleted file mode 100644 index f65ae6a3e69cb5f0a7e6073d17bfd0beae91cd5d..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LstmLayer.cpp +++ /dev/null @@ -1,805 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "LstmLayer.h" -#include "paddle/math/BaseMatrix.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Stat.h" - -DECLARE_bool(prev_batch_state); - -namespace paddle { - -REGISTER_LAYER(lstmemory, LstmLayer); - -bool LstmLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(1U, inputLayers_.size()); - CHECK_EQ(1U, parameters_.size()); - CHECK_EQ(getSize() * getSize() * 4, parameters_[0]->getSize()); - CHECK_EQ(getSize() * 7, biasParameter_->getSize()); - weight_.reset(new Weight(getSize(), getSize() * 4, parameters_[0])); - if (biasParameter_.get() != NULL) { - bias_.reset(new Weight(1, getSize() * 7, biasParameter_)); - if (bias_->getW()) { - localBias_ = Matrix::create(nullptr, - /* height= */ 1, - getSize() * 4, - /* trans= */ false, - useGpu_); - checkIg_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkFg_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkOg_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - - localBias_->setData(bias_->getW()->getData()); - checkIg_->setData(bias_->getW()->getData() + getSize() * 4); - checkFg_->setData(bias_->getW()->getData() + getSize() * 5); - checkOg_->setData(bias_->getW()->getData() + getSize() * 6); - } - - if (bias_->getWGrad()) { - localBiasGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize() * 4, - /* trans= */ false, - useGpu_); - checkIgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkFgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkOgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - localBiasGrad_->setData(bias_->getWGrad()->getData()); - 
checkIgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 4); - checkFgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 5); - checkOgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 6); - } - } else { - LOG(FATAL) << "Bias should be here."; - } - reversed_ = config_.reversed(); - - // create IdentityActivation for using drop_rate - activation_.reset(ActivationFunction::create("")); - - LstmCompute::init(config_); - useBatch_ = true; - useSeqParallel_ = false; - if (useGpu_ && (getSize() == 32 || getSize() == 64)) { - useSeqParallel_ = true; - } - - return true; -} - -void LstmLayer::resetState() { - CHECK(!reversed_) << "state is not allowed for reversed lstmemory layer"; - Matrix::resizeOrCreate( - prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); - Matrix::resizeOrCreate(prevState_, 1, getSize(), /* trans= */ false, useGpu_); - prevOutput_->resize(0, getSize()); - prevState_->resize(0, getSize()); - if (FLAGS_prev_batch_state) { - useBatch_ = true; - } else { - useBatch_ = false; - } -} - -void LstmLayer::setState(LayerStatePtr state) { - CHECK(state->value.size() == 2) << "two matrices are expected for LSTM state"; - prevOutput_->resize(state->value[0]->getHeight(), - state->value[0]->getWidth()); - prevState_->resize(state->value[1]->getHeight(), state->value[1]->getWidth()); - prevOutput_->copyFrom(*(state->value[0])); - prevState_->copyFrom(*(state->value[1])); -} - -LayerStatePtr LstmLayer::getState() { - LayerStatePtr res = std::make_shared(); - if (prevOutput_->getHeight() && prevOutput_->getWidth()) { - res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); - res->value[0]->copyFrom(*prevOutput_); - res->value.push_back(prevState_->clone(0, 0, useGpu_)); - res->value[1]->copyFrom(*prevState_); - } else { - MatrixPtr output = - Matrix::create(1, getSize(), /* trans= */ false, useGpu_); - MatrixPtr state = Matrix::create(1, getSize(), /* trans= */ false, useGpu_); - output->resize(0, getSize()); - state->resize(0, getSize()); - res->value.push_back(output); - res->value.push_back(state); - } - return res; -} - -void LstmLayer::forward(PassType passType) { - REGISTER_TIMER_INFO("LstmFwTimer", getName().c_str()); - Layer::forward(passType); - - const Argument &input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - resetOutput(batchSize, getSize()); - CHECK_EQ(getSize() * 4, input.value->getWidth()); - size_t numSequences = input.getNumSequences(); - const int *starts = input.sequenceStartPositions->getData(false); - CHECK_EQ(starts[numSequences], batchSize); - - Matrix::resizeOrCreate(gate_.value, - /* height= */ batchSize, - getSize() * 4, - /* trans= */ false, - useGpu_); - if (prevOutput_) { - size_t prevNumSeq = useBatch_ ? 
numSequences : 1; - if (prevOutput_->getHeight() == 0) { - prevOutput_->resize(prevNumSeq, getSize()); - prevState_->resize(prevNumSeq, getSize()); - prevOutput_->zeroMem(); - prevState_->zeroMem(); - } else { - CHECK_EQ(prevOutput_->getHeight(), prevNumSeq) - << "the number of sequences must be the same"; - } - Matrix::resizeOrCreate(totalState_, - prevState_->getHeight() + batchSize, - getSize(), - /*trans*/ false, - useGpu_); - state_.value = Matrix::create(nullptr, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - state_.value->setData(totalState_->getData() + - prevState_->getHeight() * getSize()); - } else { - Matrix::resizeOrCreate(state_.value, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - } - Matrix::resizeOrCreate(preOutput_.value, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - - if (!useBatch_) { - forwardSequence(batchSize, numSequences, starts, input.value); - } else { - if (!useSeqParallel_) { - forwardBatch(batchSize, numSequences, starts, input.value); - } else { - const int *starts = input.sequenceStartPositions->getData(useGpu_); - forwardSeqParallel(batchSize, numSequences, starts, input.value); - } - } - /* activation */ { forwardActivation(); } -} - -void LstmLayer::backward(const UpdateCallback &callback) { - REGISTER_TIMER_INFO("LstmBwTimer", getName().c_str()); - /* Do derivation */ { backwardActivation(); } - - const Argument &input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - size_t numSequences = input.getNumSequences(); - - Matrix::resizeOrCreate(gate_.grad, - /* height= */ batchSize, - getSize() * 4, - /* trans= */ false, - useGpu_); - Matrix::resizeOrCreate(state_.grad, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - Matrix::resizeOrCreate(preOutput_.grad, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - state_.grad->zero(); - - const int *starts = input.sequenceStartPositions->getData(false); - if (!useBatch_) { - backwardSequence(batchSize, numSequences, starts, input.grad); - } else { - if (!useSeqParallel_) { - backwardBatch(batchSize, numSequences, starts, input.grad); - } else { - const int *starts = input.sequenceStartPositions->getData(useGpu_); - backwardSeqParallel(batchSize, numSequences, starts, input.grad); - } - } - - if (bias_) { - bias_->getParameterPtr()->incUpdate(callback); - } - weight_->getParameterPtr()->incUpdate(callback); -} - -void LstmLayer::forwardSequence(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputValue) { - REGISTER_TIMER_INFO("LstmFwSequenceTime", getName().c_str()); - gate_.value->assign(*inputValue); - if (bias_) { - gate_.value->addBias(*localBias_, 1); - } - - hl_lstm_value lstmValue; - lstmValue.checkIg = checkIg_->getData(); - lstmValue.checkFg = checkFg_->getData(); - lstmValue.checkOg = checkOg_->getData(); - lstmValue.gateValue = gate_.value->getData(); - lstmValue.stateValue = state_.value->getData(); - lstmValue.stateActiveValue = preOutput_.value->getData(); - lstmValue.outputValue = output_.value->getData(); - lstmValue.prevStateValue = nullptr; - if (reversed_) { - lstmValue.gateValue += (batchSize - 1) * getSize() * 4; - lstmValue.stateValue += (batchSize - 1) * getSize(); - lstmValue.stateActiveValue += (batchSize - 1) * getSize(); - lstmValue.outputValue += (batchSize - 1) * getSize(); - } - - auto nextFrame = [&lstmValue](bool reversed, int frameSize) { - lstmValue.prevStateValue = 
lstmValue.stateValue; - if (!reversed) { - lstmValue.gateValue += frameSize * 4; - lstmValue.stateValue += frameSize; - lstmValue.stateActiveValue += frameSize; - lstmValue.outputValue += frameSize; - } else { - lstmValue.gateValue -= frameSize * 4; - lstmValue.stateValue -= frameSize; - lstmValue.stateActiveValue -= frameSize; - lstmValue.outputValue -= frameSize; - } - }; - - MatrixPtr frameGate = Matrix::create(nullptr, - /* height= */ 1, - getSize() * 4, - /* trans= */ false, - useGpu_); - MatrixPtr frameOutput = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - - if (!reversed_) { - if (prevState_) { - lstmValue.prevStateValue = prevState_->getData(); - } - if (prevOutput_) { - frameGate->setData(lstmValue.gateValue); - frameGate->mul(*prevOutput_, *weight_->getW(), 1, 1); - } - } - AsyncGpuBlock asyncGpuBlock; - for (size_t n = 0; n < numSequences; ++n) { - int length; - if (!reversed_) { - length = starts[n + 1] - starts[n]; - } else { - length = starts[numSequences - n] - starts[numSequences - n - 1]; - } - for (int l = 0; l < length; ++l) { - if (useGpu_) { - LstmCompute::forwardOneSequence<1>(lstmValue, getSize()); - } else { - LstmCompute::forwardOneSequence<0>(lstmValue, getSize()); - } - - if (l != length - 1) { - frameOutput->setData(lstmValue.outputValue); - nextFrame(reversed_, getSize()); - frameGate->setData(lstmValue.gateValue); - frameGate->mul(*frameOutput, *weight_->getW(), 1, 1); - } - } - if (n != numSequences - 1) { - frameOutput->setData(lstmValue.outputValue); - nextFrame(reversed_, getSize()); - frameGate->setData(lstmValue.gateValue); - if (!reversed_) { - if (!prevState_) lstmValue.prevStateValue = nullptr; - if (prevOutput_) { - frameGate->mul(*frameOutput, *weight_->getW(), 1, 1); - } - } else { - lstmValue.prevStateValue = nullptr; - } - } - } - - if (!reversed_) { - if (prevState_) { - prevState_->assign(*state_.value->subMatrix(batchSize - 1, 1)); - } - if (prevOutput_) { - prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1)); - } - } -} - -void LstmLayer::backwardSequence(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputGrad) { - REGISTER_TIMER_INFO("LstmBwSequenceTime", getName().c_str()); - MatrixPtr weightT = weight_->getW()->getTranspose(); - - hl_lstm_value lstmValue; - hl_lstm_grad lstmGrad; - lstmValue.checkIg = checkIg_->getData(); - lstmValue.checkFg = checkFg_->getData(); - lstmValue.checkOg = checkOg_->getData(); - lstmValue.gateValue = gate_.value->getData(); - lstmValue.stateValue = state_.value->getData(); - lstmValue.stateActiveValue = preOutput_.value->getData(); - lstmValue.outputValue = nullptr; - - if (bias_->getWGrad()) { - lstmGrad.checkIgGrad = checkIgGrad_->getData(); - lstmGrad.checkFgGrad = checkFgGrad_->getData(); - lstmGrad.checkOgGrad = checkOgGrad_->getData(); - } else { - lstmGrad.checkIgGrad = nullptr; - lstmGrad.checkFgGrad = nullptr; - lstmGrad.checkOgGrad = nullptr; - } - lstmGrad.gateGrad = gate_.grad->getData(); - lstmGrad.stateGrad = state_.grad->getData(); - lstmGrad.stateActiveGrad = nullptr; - lstmGrad.outputGrad = output_.grad->getData(); - - if (!reversed_) { - lstmValue.gateValue += (batchSize - 1) * getSize() * 4; - lstmGrad.gateGrad += (batchSize - 1) * getSize() * 4; - lstmValue.stateValue += (batchSize - 1) * getSize(); - lstmGrad.stateGrad += (batchSize - 1) * getSize(); - lstmValue.stateActiveValue += (batchSize - 1) * getSize(); - lstmGrad.outputGrad += (batchSize - 1) * getSize(); - lstmValue.prevStateValue = 
lstmValue.stateValue - getSize(); - lstmGrad.prevStateGrad = lstmGrad.stateGrad - getSize(); - } else { - lstmValue.prevStateValue = lstmValue.stateValue + getSize(); - lstmGrad.prevStateGrad = lstmGrad.stateGrad + getSize(); - } - - auto nextFrame = [&lstmValue, &lstmGrad](bool reversed, int frameSize) { - if (reversed) { - lstmValue.gateValue += frameSize * 4; - lstmGrad.gateGrad += frameSize * 4; - lstmValue.stateValue += frameSize; - lstmGrad.stateGrad += frameSize; - lstmValue.stateActiveValue += frameSize; - lstmGrad.outputGrad += frameSize; - lstmValue.prevStateValue = lstmValue.stateValue + frameSize; - lstmGrad.prevStateGrad = lstmGrad.stateGrad + frameSize; - } else { - lstmValue.gateValue -= frameSize * 4; - lstmGrad.gateGrad -= frameSize * 4; - lstmValue.stateValue -= frameSize; - lstmGrad.stateGrad -= frameSize; - lstmValue.stateActiveValue -= frameSize; - lstmGrad.outputGrad -= frameSize; - lstmValue.prevStateValue = lstmValue.stateValue - frameSize; - lstmGrad.prevStateGrad = lstmGrad.stateGrad - frameSize; - } - }; - - MatrixPtr frameGate = Matrix::create(nullptr, - /* height= */ 1, - getSize() * 4, - /* trans= */ false, - useGpu_); - MatrixPtr frameOutput = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - - { - AsyncGpuBlock asyncGpuBlock; - for (size_t n = 0; n < numSequences; ++n) { - int length; - int start; - if (reversed_) { - length = starts[n + 1] - starts[n]; - start = starts[n]; - } else { - length = starts[numSequences - n] - starts[numSequences - n - 1]; - start = starts[numSequences - n - 1]; - } - for (int l = 0; l < length; ++l) { - if (l == length - 1) { - lstmValue.prevStateValue = nullptr; - lstmGrad.prevStateGrad = nullptr; - } - if (useGpu_) { - LstmCompute::backwardOneSequence<1>(lstmValue, lstmGrad, getSize()); - } else { - LstmCompute::backwardOneSequence<0>(lstmValue, lstmGrad, getSize()); - } - if (l != length - 1) { - frameGate->setData(lstmGrad.gateGrad); - nextFrame(reversed_, getSize()); - frameOutput->setData(lstmGrad.outputGrad); - frameOutput->mul(*frameGate, *weightT, 1, 1); - } else { - nextFrame(reversed_, getSize()); - } - } - - if (weight_->getWGrad()) { - if (!reversed_) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(start, length - 1)->getTranspose(), - *gate_.grad->subMatrix(start + 1, length - 1), - 1, - 1); - } else { - weight_->getWGrad()->mul( - *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), - *gate_.grad->subMatrix(start, length - 1), - 1, - 1); - } - } - } - } - - if (inputGrad) { - inputGrad->add(*gate_.grad); - } - if (bias_ && bias_->getWGrad()) { - localBiasGrad_->collectBias(*gate_.grad, 1); - } -} - -void LstmLayer::forwardBatch(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputValue) { - REGISTER_TIMER_INFO("LstmFwBatchTime", getName().c_str()); - - hl_lstm_value lstmValue; - lstmValue.checkIg = checkIg_->getData(); - lstmValue.checkFg = checkFg_->getData(); - lstmValue.checkOg = checkOg_->getData(); - - if (!batchValue_) { - batchValue_.reset(new SequenceToBatch(useGpu_)); - } - batchValue_->resizeOrCreateBatch( - batchSize, numSequences, starts, reversed_, prevOutput_ ? 
true : false); - - batchValue_->resizeOrCreate(*output_.value); - batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true); - if (bias_) { - gate_.value->addBias(*localBias_, 1); - } - - { - int numBatch = batchValue_->getNumBatch(); - int batchSize = 0; - AsyncGpuBlock asyncGpuBlock; - if (prevState_) { - lstmValue.prevStateValue = totalState_->getData(); - } else { - lstmValue.prevStateValue = nullptr; - } - for (int n = 0; n < numBatch; n++) { - MatrixPtr outputValue = batchValue_->getBatchValue(n); - MatrixPtr gateValue = batchValue_->getBatchValue(*gate_.value, n); - batchSize = outputValue->getHeight(); - - if (n != 0) { - MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize); - gateValue->mul(*batch1, *weight_->getW(), 1, 1); - } else if (prevOutput_) { - Matrix::resizeOrCreate(prevBatchOutput2_, - gateValue->getHeight(), - getSize(), - false, - useGpu_); - batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_); - gateValue->mul(*prevBatchOutput2_, *weight_->getW(), 1, 1); - - batchValue_->prevOutput2Batch(*prevState_, - *totalState_->subMatrix(0, numSequences)); - } - - lstmValue.gateValue = gateValue->getData(); - lstmValue.outputValue = outputValue->getData(); - lstmValue.stateValue = - batchValue_->getBatchValue(*state_.value, n)->getData(); - lstmValue.stateActiveValue = - batchValue_->getBatchValue(*preOutput_.value, n)->getData(); - { - if (useGpu_) { - LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize); - } else { - LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize); - } - } - lstmValue.prevStateValue = lstmValue.stateValue; - } - } - { - REGISTER_TIMER_INFO("batchToSeq", getName().c_str()); - batchValue_->copyBackSeq(*output_.value); - } - if (prevOutput_) { - getPrevBatchOutput(numSequences); - getPrevBatchState(numSequences); - } -} - -void LstmLayer::getPrevBatchOutput(size_t numSequences) { - prevOutput_->resize(numSequences, getSize()); - batchValue_->getSeqOutputFromBatch(*prevOutput_, - *batchValue_->getBatchValue()); -} - -void LstmLayer::getPrevBatchState(size_t numSequences) { - prevState_->resize(numSequences, getSize()); - batchValue_->getSeqOutputFromBatch(*prevState_, *state_.value); -} - -void LstmLayer::backwardBatch(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputGrad) { - REGISTER_TIMER_INFO("LstmBwBatchTime", getName().c_str()); - - hl_lstm_value lstmValue; - lstmValue.checkIg = checkIg_->getData(); - lstmValue.checkFg = checkFg_->getData(); - lstmValue.checkOg = checkOg_->getData(); - - hl_lstm_grad lstmGrad; - lstmGrad.stateActiveGrad = preOutput_.grad->getData(); - - if (bias_->getWGrad()) { - lstmGrad.checkIgGrad = checkIgGrad_->getData(); - lstmGrad.checkFgGrad = checkFgGrad_->getData(); - lstmGrad.checkOgGrad = checkOgGrad_->getData(); - } else { - lstmGrad.checkIgGrad = nullptr; - lstmGrad.checkFgGrad = nullptr; - lstmGrad.checkOgGrad = nullptr; - } - - if (!batchGrad_) { - batchGrad_.reset(new SequenceToBatch(useGpu_)); - } - batchGrad_->shareIndexWith(*batchValue_); - - { - REGISTER_TIMER_INFO("seqToBatch", getName().c_str()); - batchGrad_->copyFromSeq(*output_.grad); - } - - { - MatrixPtr weightT = weight_->getW()->getTranspose(); - int numBatch = batchGrad_->getNumBatch(); - int batchSize = 0; - AsyncGpuBlock asyncGpuBlock; - for (int n = (int)numBatch - 1; n >= 0; n--) { - MatrixPtr outputGrad = batchGrad_->getBatchValue(n); - MatrixPtr gateGrad = batchGrad_->getBatchValue(*gate_.grad, n); - - lstmValue.gateValue = - batchGrad_->getBatchValue(*gate_.value, 
n)->getData(); - lstmValue.stateValue = - batchGrad_->getBatchValue(*state_.value, n)->getData(); - lstmValue.stateActiveValue = - batchGrad_->getBatchValue(*preOutput_.value, n)->getData(); - lstmGrad.stateGrad = - batchGrad_->getBatchValue(*state_.grad, n)->getData(); - lstmGrad.gateGrad = gateGrad->getData(); - lstmGrad.outputGrad = outputGrad->getData(); - { - batchSize = outputGrad->getHeight(); - if (n != 0) { - lstmValue.prevStateValue = - batchGrad_->getBatchValue(*state_.value, n - 1)->getData(); - lstmGrad.prevStateGrad = - batchGrad_->getBatchValue(*state_.grad, n - 1)->getData(); - } else { - if (prevState_) { - lstmValue.prevStateValue = totalState_->getData(); - lstmGrad.prevStateGrad = nullptr; - } else { - lstmValue.prevStateValue = nullptr; - lstmGrad.prevStateGrad = nullptr; - } - } - if (useGpu_) { - LstmCompute::backwardBatch<1>( - lstmValue, lstmGrad, getSize(), batchSize); - } else { - LstmCompute::backwardBatch<0>( - lstmValue, lstmGrad, getSize(), batchSize); - } - } - - if (n != 0) { - MatrixPtr tmp = batchGrad_->getBatchValue(n - 1, batchSize); - tmp->mul(*gateGrad, *weightT, 1, 1); - } - - if (n != 0 && weight_->getWGrad()) { - /* backward weight */ - MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize); - weight_->getWGrad()->mul(*outputValue->getTranspose(), *gateGrad, 1, 1); - } else if (prevOutput_ && weight_->getWGrad()) { - weight_->getWGrad()->mul( - *prevBatchOutput2_->getTranspose(), *gateGrad, 1, 1); - } - } - } - - if (inputGrad) { - batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false); - } - if (bias_ && bias_->getWGrad()) { - localBiasGrad_->collectBias(*gate_.grad, /* scale */ 1); - } -} - -void LstmLayer::forwardSeqParallel(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputValue) { - REGISTER_TIMER_INFO("LstmFwSeqParallelTime", getName().c_str()); - gate_.value->assign(*inputValue); - if (bias_) { - gate_.value->addBias(*localBias_, /* scale */ 1); - } - - real *gateValue = gate_.value->getData(); - real *stateValue = state_.value->getData(); - real *outputValue = output_.value->getData(); - real *preOutputValue = preOutput_.value->getData(); - real *checkIg = checkIg_->getData(); - real *checkFg = checkFg_->getData(); - real *checkOg = checkOg_->getData(); - real *weight = weight_->getW()->getData(); - hl_lstm_parallel_forward(gateValue, - stateValue, - preOutputValue, - outputValue, - checkIg, - checkFg, - checkOg, - weight, - starts, - getSize(), - numSequences, - reversed_, - activeNode_, - activeGate_, - activeState_); -} - -void LstmLayer::backwardSeqParallel(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputGrad) { - REGISTER_TIMER_INFO("LstmBwSeqParallelTime", getName().c_str()); - real *gateValue = gate_.value->getData(); - real *gateGrad = gate_.grad->getData(); - real *stateValue = state_.value->getData(); - real *stateGrad = state_.grad->getData(); - real *preOutputValue = preOutput_.value->getData(); - real *preOutputGrad = preOutput_.grad->getData(); - real *checkIg = checkIg_->getData(); - real *checkFg = checkFg_->getData(); - real *checkOg = checkOg_->getData(); - real *outputGrad = output_.grad->getData(); - real *weight = weight_->getW()->getData(); - - real *checkIgGrad; - real *checkFgGrad; - real *checkOgGrad; - if (bias_->getWGrad()) { - checkIgGrad = checkIgGrad_->getData(); - checkFgGrad = checkFgGrad_->getData(); - checkOgGrad = checkOgGrad_->getData(); - } else { - checkIgGrad = nullptr; - checkFgGrad = nullptr; - checkOgGrad = nullptr; - } - - 
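The batch path above (forwardBatch/backwardBatch) leans on SequenceToBatch: batch n gathers the n-th word of every sequence that is at least n+1 words long, so the number of batches equals the longest sequence length and batches shrink as short sequences run out. A minimal standalone sketch of that reordering, assuming the starts[] convention documented in this layer (seqToBatch and its simplified return type are hypothetical, not the real SequenceToBatch API, which also sorts sequences by length):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Sketch: group words of variable-length sequences into per-timestep batches.
// starts[] follows the Paddle convention: sequence i occupies rows
// [starts[i], starts[i+1]) of the flat input; starts.back() == total words.
std::vector<std::vector<int>> seqToBatch(const std::vector<int>& starts) {
  size_t numSeq = starts.size() - 1;
  int maxLen = 0;
  for (size_t i = 0; i < numSeq; ++i)
    maxLen = std::max(maxLen, starts[i + 1] - starts[i]);

  // batches[n] holds the flat row index of timestep n of each live sequence.
  std::vector<std::vector<int>> batches(maxLen);
  for (int n = 0; n < maxLen; ++n)
    for (size_t i = 0; i < numSeq; ++i)
      if (starts[i] + n < starts[i + 1]) batches[n].push_back(starts[i] + n);
  return batches;
}

int main() {
  // Three sequences of lengths 3, 1, 2 -> batch sizes shrink as 3, 2, 1.
  std::vector<int> starts = {0, 3, 4, 6};
  auto batches = seqToBatch(starts);
  for (size_t n = 0; n < batches.size(); ++n) {
    std::printf("batch %zu:", n);
    for (int row : batches[n]) std::printf(" %d", row);
    std::printf("\n");
  }
  return 0;
}
```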
hl_lstm_parallel_backward_data(gateValue, - gateGrad, - stateValue, - stateGrad, - preOutputValue, - preOutputGrad, - outputGrad, - checkIg, - checkIgGrad, - checkFg, - checkFgGrad, - checkOg, - checkOgGrad, - weight, - starts, - getSize(), - numSequences, - reversed_, - activeNode_, - activeGate_, - activeState_); - - if (inputGrad) { - inputGrad->add(*gate_.grad); - } - if (bias_ && bias_->getWGrad()) { - localBiasGrad_->collectBias(*gate_.grad, 1); - } - - real *outputValue = output_.value->getData(); - if (weight_->getWGrad()) { - real *weightGrad = weight_->getWGrad()->getData(); - hl_lstm_parallel_backward_weight(weightGrad, - outputValue, - gateGrad, - starts, - getSize(), - batchSize, - numSequences, - reversed_); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h deleted file mode 100644 index 76dfe8146bf67a0b7b4fd4835851fae6ac38d80f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LstmLayer.h +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "LstmCompute.h" -#include "SequenceToBatch.h" -#include "paddle/math/BaseMatrix.h" -#include "paddle/math/Matrix.h" -namespace paddle { - -/** - * @brief LstmLayer takes 1 input layer with size * 4. - * Input layer is diveded into 4 equal parts: - * (input_s, input_ig, input_fg, input_og) - * - * For each sequence [start, end] it performs the following computation: - * @code - * output_{i} = actState(state_{i}) * actGate(outputGate_{i}) - * state_{i} = actInput(input_s_{i} + bias_s + - * output_{i-1} * recurrIW) * actGate(inputGate_{i}) + - * actGate(forgetGate_{i}) * state_{i-1} - * inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW + - * state_{i-1} * inputCheck - * ouputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW + - * state_{i} * outputCheck - * forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW + - * state_{i-1} * forgetCheck - * @endcode - * - * - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW) - * - baisParameter consists of - * (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck) - * - * - actInput is defined by config active_type. - * - actState is defined by config active_state_type. - * - actGate is defined by config actvie_gate_type. - * - * There are two ways to compute, namely one sequence by one sequence or - * one batch by one batch. By default and no setting pre_batch_state true, - * it will compute batch by batch. 
- *
- * The formula in the paper is as follows:
- * \f[
- * i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\
- * f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\
- * \tilde{c_t} = tanh(W_{xc}x_{t} + W_{hc}h_{t-1} + b_c) \\
- * o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\
- * c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\
- * h_t = o_t * tanh(c_t)
- * \f]
- *
- * @note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
- * operations on the input sequence are NOT included in LstmLayer, so
- * users should use fc_layer or mixed_layer before lstm_layer.
- *
- * The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
- * The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
- */
-
-class LstmLayer : public Layer, public LstmCompute {
- public:
-  explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
-
-  bool init(const LayerMap &layerMap,
-            const ParameterMap &parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback &callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
- protected:
-  /**
-   * @brief Compute lstm forward one sequence by one sequence.
-   * @param batchSize This batchSize is not equal to the batch_size in
-   * the config file. It is the total number of words of all samples
-   * in this forward batch.
-   * @param numSequences The number of samples. It is equal to the batch_size
-   * in the config file.
-   * @param starts The start position of each sample.
-   * @param inputValue The input values.
-   */
-  void forwardSequence(int batchSize,
-                       size_t numSequences,
-                       const int *starts,
-                       MatrixPtr inputValue);
-  /**
-   * Compute lstm backward one sequence by one sequence.
-   */
-  void backwardSequence(int batchSize,
-                        size_t numSequences,
-                        const int *starts,
-                        MatrixPtr inputGrad);
-
-  /**
-   * Compute lstm forward one batch by one batch. The batch value is
-   * reorganized by the SequenceToBatch class. The batch output values are
-   * converted back into sequence values after the forward pass finishes.
-   * Here, one batch contains one word of each sample. If the sample lengths
-   * are not equal, the batch is not zero-padded and simply contains fewer
-   * words. The total number of batches equals the max sequence length.
-   * See the SequenceToBatch class for details. In GPU mode, one GPU kernel
-   * is launched per loop iteration.
-   *
-   * @code
-   * for (int i = 0; i < numBatch(max_sequence_length); ++i) {
-   *   compute one batch.
-   * }
-   * @endcode
-   */
-  void forwardBatch(int batchSize,
-                    size_t numSequences,
-                    const int *starts,
-                    MatrixPtr inputValue);
-  /**
-   * Compute lstm backward one batch by one batch.
-   */
-  void backwardBatch(int batchSize,
-                     size_t numSequences,
-                     const int *starts,
-                     MatrixPtr inputGrad);
-
-  /**
-   * This function only supports GPU. It does not need to reorganize the
-   * input into batch values. It launches one kernel that computes the
-   * forward propagation in parallel at the sequence level.
-   */
-  void forwardSeqParallel(int batchSize,
-                          size_t numSequences,
-                          const int *starts,
-                          MatrixPtr inputValue);
-  /**
-   * Backward propagation corresponding to forwardSeqParallel.
-   */
-  void backwardSeqParallel(int batchSize,
-                           size_t numSequences,
-                           const int *starts,
-                           MatrixPtr inputGrad);
-  /**
-   * This function is used in sequence generation to get the output after
-   * forwardBatch.
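As a concrete companion to the gate equations above, here is a minimal scalar sketch of one LSTM step with peephole connections (hidden size 1, plain doubles; lstmStep and the parameter names are illustrative only — the real layer operates on [batch, 4*size] gate matrices via LstmCompute):

```cpp
#include <cmath>
#include <cstdio>

double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

struct LstmState { double c, h; };

// One LSTM step for hidden size 1. xi/xf/xc/xo are the four slices of the
// projected input (the fc_layer/mixed_layer output feeding this layer);
// wh* are recurrent weights, wc* peephole weights, b* biases.
LstmState lstmStep(double xi, double xf, double xc, double xo,
                   double whi, double whf, double whc, double who,
                   double wci, double wcf, double wco,
                   double bi, double bf, double bc, double bo,
                   LstmState prev) {
  double i = sigmoid(xi + whi * prev.h + wci * prev.c + bi);  // input gate
  double f = sigmoid(xf + whf * prev.h + wcf * prev.c + bf);  // forget gate
  double cand = std::tanh(xc + whc * prev.h + bc);            // candidate
  double c = f * prev.c + i * cand;                           // new cell state
  double o = sigmoid(xo + who * prev.h + wco * c + bo);       // output gate peeks at new c
  return {c, o * std::tanh(c)};                               // h = o * tanh(c)
}

int main() {
  LstmState s{0.0, 0.0};
  for (int t = 0; t < 3; ++t) {
    s = lstmStep(0.5, 0.1, 0.3, 0.2, 0.4, 0.4, 0.4, 0.4,
                 0.1, 0.1, 0.1, 0.0, 1.0, 0.0, 0.0, s);
    std::printf("t=%d c=%.4f h=%.4f\n", t, s.c, s.h);
  }
  return 0;
}
```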
- */ - void getPrevBatchOutput(size_t numSequences); - /** - * This function is used for sequence generation and get state after - * forwardBatch. - */ - void getPrevBatchState(size_t numSequences); - - protected: - /// Learned parameters, shape: (size, 4*size). - /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$. - std::unique_ptr weight_; - /// Learned bias parameter, shape: (1, 7 * size). - /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, - /// W_{co}\f$. - std::unique_ptr bias_; - /// The reeal bias, point to \f$b_i, b_f, b_c, b_o\f$. - MatrixPtr localBias_; - /// The peephole connection for input gate. - MatrixPtr checkIg_; - /// The peephole connection for forget gate. - MatrixPtr checkFg_; - /// The peephole connection for output gate. - MatrixPtr checkOg_; - /// The gradient of real bias - MatrixPtr localBiasGrad_; - /// The gradient of peephole connection for input gates. - MatrixPtr checkIgGrad_; - /// The gradient of peephole connection for forget gates. - MatrixPtr checkFgGrad_; - /// The gradient of peephole connection for output gates. - MatrixPtr checkOgGrad_; - - /// Stores the cell state of previous time step, namely \f$c_{t-1}\f$. - Argument state_; - /// Stores the hidden of previous time step, namely \f$h_{t-1}\f$. - Argument preOutput_; - /// Stores the value and gradient of four gates, namely - /// \f$i_t, f_t, o_t, c_t\f$. - Argument gate_; - /// Whether it is reversed lstm. - bool reversed_; - /// Whether to use batch method to compute. - bool useBatch_; - /// Whether to use sequence parallell method to compute. - bool useSeqParallel_; - /// batchValue_ is used in method of batch calculation. It stores the - /// batch value after reorganized input. - std::unique_ptr batchValue_; - /// The gradient of batchValue_. - std::unique_ptr batchGrad_; - - /// Used in generation and stores the state of previous time step. - MatrixPtr prevState_; - /// Used in generation and stores the output of previous time step. - MatrixPtr prevOutput_; - MatrixPtr prevBatchOutput2_; - /// The total state. - MatrixPtr totalState_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp deleted file mode 100644 index 22c28157c5a5b19aa54b3151a6c9a4cdcfb01765..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MDLstmLayer.cpp +++ /dev/null @@ -1,769 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
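The (1, 7*size) bias described above is a single flat parameter carved into views by pointer offsets, in the same spirit as the setData calls in MDLstmLayer::init further below. A small sketch of that partitioning, assuming the layout documented in the comments (LstmBiasView and splitBias are illustrative names, not the layer's API):

```cpp
#include <vector>

// Sketch: one flat parameter of length 7 * size, viewed as four gate biases
// followed by three peephole vectors, mirroring localBias_/checkIg_/checkFg_/
// checkOg_ in the layer. The views alias the parameter; nothing is copied.
struct LstmBiasView {
  const float* gateBias;  // b_i, b_f, b_c, b_o : 4 * size
  const float* checkIg;   // W_ci : size
  const float* checkFg;   // W_cf : size
  const float* checkOg;   // W_co : size
};

LstmBiasView splitBias(const float* data, int size) {
  return {data, data + 4 * size, data + 5 * size, data + 6 * size};
}

int main() {
  int size = 8;
  std::vector<float> param(7 * size, 0.f);
  LstmBiasView v = splitBias(param.data(), size);
  (void)v;
  return 0;
}
```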
*/ - -#include "LstmLayer.h" -#include "paddle/math/BaseMatrix.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -class CoordIterator { - public: - std::vector dims_; - std::vector directions_; - std::vector curPos_; - bool end_; - - void step(size_t d, bool reversed) { - if (directions_[d] ^ reversed) { - if (curPos_[d] == dims_[d] - 1) { - curPos_[d] = 0; - if (d) { - step(d - 1, reversed); - } else { - end_ = true; - } - } else { - curPos_[d]++; - } - } else { - if (curPos_[d] == 0) { - curPos_[d] = dims_[d] - 1; - if (d) { - step(d - 1, reversed); - } else { - end_ = true; - } - } else { - curPos_[d]--; - } - } - } - - public: - CoordIterator(std::vector dim, std::vector directions) - : dims_(dim), directions_(directions), end_(false) { - CHECK_EQ(dims_.size(), directions_.size()); - for (size_t i = 0; i < dims_.size(); i++) { - curPos_.push_back(-1); - } - } - CoordIterator& operator++() { - step(dims_.size() - 1, false); - return *this; - } - - CoordIterator& operator--() { - step(dims_.size() - 1, true); - return *this; - } - - std::vector& curPos() { return curPos_; } - - int offset() { - int offset = curPos_[0]; - for (size_t i = 1; i < dims_.size(); i++) { - offset = offset * dims_[i] + curPos_[i]; - } - return offset; - } - - int offset(const std::vector& pos) { - int offset = pos[0]; - for (size_t i = 1; i < dims_.size(); i++) { - offset = offset * dims_[i] + pos[i]; - } - return offset; - } - - std::vector& begin() { - for (size_t i = 0; i < dims_.size(); i++) { - curPos_[i] = directions_[i] ? 0 : dims_[i] - 1; - } - end_ = false; - return curPos_; - } - - std::vector& rbegin() { - for (size_t i = 0; i < dims_.size(); i++) { - curPos_[i] = directions_[i] ? dims_[i] - 1 : 0; - } - end_ = false; - return curPos_; - } - - bool end() { return end_; } - - bool getPrePos(const std::vector& delays, - int idx, - std::vector& prePos) { - bool isAvial = true; - prePos.clear(); - prePos.reserve(directions_.size()); - for (size_t i = 0; i < directions_.size(); i++) { - if (int(i) == idx) { - prePos.push_back(curPos_[i] + delays[i] * (directions_[i] ? 1 : -1)); - if (prePos[i] < 0) { - prePos[i] = 0; - isAvial = false; - } - if (prePos[i] >= dims_[i]) { - prePos[i] = dims_[i] - 1; - isAvial = false; - } - } else { - prePos.push_back(curPos_[i]); - } - } - return isAvial; - } - - bool getNextPos(const std::vector& delays, - int idx, - std::vector& nextPos) { - bool isAvial = true; - nextPos.clear(); - nextPos.reserve(directions_.size()); - for (size_t i = 0; i < directions_.size(); i++) { - if (int(i) == idx) { - nextPos.push_back(curPos_[i] - delays[i] * (directions_[i] ? 1 : -1)); - if (nextPos[i] < 0) { - nextPos[i] = 0; - isAvial = false; - } - if (nextPos[i] >= dims_[i]) { - nextPos[i] = dims_[i] - 1; - isAvial = false; - } - } else { - nextPos.push_back(curPos_[i]); - } - } - return isAvial; - } -}; -/* - * MDLstmLayer takes 1 input layer with size * (3+numDims). - * For each sequence [start, end] it performs the following computation: - * out_i = actState(state_i) * actGate(outputGate_i) - * - * For example the image with 2 dims, we take the scanning order from left-top - * to right-bottom, then the 2 previous states of the current pixels are the - * ones located at left and top. And each of them has a independent forget gate. 
- * - * state_i = actInput(input_i) * actGate(inputGate_i) + - * \sum{j}(actGate(forgetGate_i_j) * state_prev_i_j) - * - * inputGate = input_i * inputW + \sum{j}(output_prev_i_j * recurrInputW_j) + - * \sum{j}(state_prev_i_j * inputCheck_j) - * - * ouputGate = input_i * outputW + \sum{j}(output_prev_i_j * recurrOutputW_j) + - * state_i * outputCheck - * - * forgetGate_j = input_i * forgetW_j + \sum{j}(output_prev_i_j * - * recurrForgetW_j) + \sum{j}(state_prev_i_j * forgetCheck_j) - * - * IG Layer: (Input, InputGate, ForgetGates, OutputGate) * OutputSize - * */ - -class MDLstmLayer : public LstmLayer { - public: - explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback) override; - - protected: - void forwardOneSequence(int start, CoordIterator& coordIter); - void backwardOneSequence(int start, CoordIterator& coordIter); - void forwardGate2OutputSequence(int start, CoordIterator& coordIter); - void backwardGate2OutputSequence(int start, CoordIterator& coordIter); - - protected: - std::vector frameInputGate_; - std::vector frameForgetGate_; - std::vector frameOutputGate_; - std::vector frameInputNode_; - std::vector frameGate_; - std::vector frameState_; - std::vector framePreOutput_; - std::vector frameOutput_; - - // Activation - std::unique_ptr activationGate_; - std::unique_ptr activationState_; - - int numDims_; - size_t numBlocks_; - std::vector directions_; - std::vector delays_; - std::vector> dimsV_; -}; - -REGISTER_LAYER(mdlstmemory, MDLstmLayer); - -bool MDLstmLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(1U, inputLayers_.size()); - CHECK_EQ(1U, parameters_.size()); - - numBlocks_ = getSize(); - numDims_ = config_.directions_size(); - CHECK_EQ(numBlocks_ * numBlocks_ * (3 + numDims_), parameters_[0]->getSize()); - - // inode(1), ig(1), fg(numDims_), og(1), peepIg(1), peepFg(numDims_), - // peepOg(1), then size of localBias_ is 3+numDims_ - CHECK_EQ(numBlocks_ * (5 + 2 * numDims_), biasParameter_->getSize()); - weight_.reset( - new Weight(numBlocks_, numBlocks_ * (3 + numDims_), parameters_[0])); - if (biasParameter_.get() != NULL) { - bias_.reset(new Weight(1, numBlocks_ * (5 + 2 * numDims_), biasParameter_)); - localBias_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - checkIg_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - checkFg_ = Matrix::create(nullptr, - /* height= */ numDims_, - numBlocks_, - /* trans= */ false, - useGpu_); - checkOg_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - localBiasGrad_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - checkIgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - checkFgGrad_ = Matrix::create(nullptr, - /* height= */ numDims_, - numBlocks_, - /* trans= */ false, - useGpu_); - checkOgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - - localBias_->setData(bias_->getW()->getData()); - checkIg_->setData(bias_->getW()->getData() + numBlocks_ * (3 + numDims_)); - checkFg_->setData(bias_->getW()->getData() + numBlocks_ * (4 + 
numDims_)); - checkOg_->setData(bias_->getW()->getData() + - numBlocks_ * (4 + 2 * numDims_)); - - if (bias_->getWGrad()) { - localBiasGrad_->setData(bias_->getWGrad()->getData()); - checkIgGrad_->setData(bias_->getWGrad()->getData() + - numBlocks_ * (3 + numDims_)); - checkFgGrad_->setData(bias_->getWGrad()->getData() + - numBlocks_ * (4 + numDims_)); - checkOgGrad_->setData(bias_->getWGrad()->getData() + - numBlocks_ * (4 + 2 * numDims_)); - } - } else { - LOG(FATAL) << "Bias should be here."; - } - for (int i = 0; i < numDims_; i++) { - directions_.push_back(config_.directions(i)); - } - for (int i = 0; i < numDims_; i++) { - delays_.push_back(-1); - } - activationGate_.reset(ActivationFunction::create(config_.active_gate_type())); - activationState_.reset( - ActivationFunction::create(config_.active_state_type())); - - return true; -} - -void MDLstmLayer::forward(PassType passType) { - Layer::forward(passType); - - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - int numSequences = input.getNumSequences(); - resetOutput(batchSize, numBlocks_); - CHECK_EQ(numBlocks_ * (3 + numDims_), input.value->getWidth()); - const int* starts = input.sequenceStartPositions->getData(false); - CHECK_EQ(starts[numSequences], batchSize); - - int* dimsData = input.cpuSequenceDims->getData(); - CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_* numSequences); - - for (int i = 0; i < numSequences; i++) { - std::vector dims; - for (int j = 0; j < numDims_; j++) { - dims.push_back(dimsData[i * numDims_ + j]); - } - dimsV_.push_back(dims); - } - - frameInputGate_.reserve(batchSize); - frameForgetGate_.reserve(batchSize); - frameOutputGate_.reserve(batchSize); - frameInputNode_.reserve(batchSize); - frameGate_.reserve(batchSize); - frameState_.reserve(batchSize); - framePreOutput_.reserve(batchSize); - frameOutput_.reserve(batchSize); - - Matrix::resizeOrCreate(gate_.value, - /* height= */ batchSize, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - - for (int i = frameGate_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - frameGate_.push_back(arg); - } - for (int i = frameInputGate_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - frameInputGate_.push_back(arg); - } - for (int i = frameForgetGate_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ numDims_, - numBlocks_, - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ numDims_, - numBlocks_, - /* trans= */ false, - useGpu_); - frameForgetGate_.push_back(arg); - } - for (int i = frameOutputGate_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - frameOutputGate_.push_back(arg); - } - for (int i = frameInputNode_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= 
*/ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - frameInputNode_.push_back(arg); - } - for (int i = frameState_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create( - /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); - frameState_.push_back(arg); - } - for (int i = framePreOutput_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create( - /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); - framePreOutput_.push_back(arg); - } - for (int i = frameOutput_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - frameOutput_.push_back(arg); - } - - for (int i = 0; i < batchSize; i++) { - frameOutput_[i].value->setData(output_.value->getData() + i * numBlocks_); - frameGate_[i].value->setData(gate_.value->getData() + - i * numBlocks_ * (3 + numDims_)); - frameInputNode_[i].value->setData(gate_.value->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 0); - frameInputGate_[i].value->setData(gate_.value->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 1); - frameForgetGate_[i].value->setData(gate_.value->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 2); - frameOutputGate_[i].value->setData(gate_.value->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * (2 + numDims_)); - } - - AsyncGpuBlock asyncGpuBlock; - gate_.value->assign(*input.value); - - if (bias_) { - gate_.value->addBias(*localBias_, 1); - } - - for (int i = 0; i < numSequences; i++) { - CoordIterator coordIter(dimsV_[i], directions_); - forwardOneSequence(starts[i], coordIter); - } -} - -void MDLstmLayer::forwardGate2OutputSequence(int start, - CoordIterator& coordIter) { - int idxCurr = start + coordIter.offset(); - std::vector preOffsetV; - preOffsetV.reserve(numDims_); - for (int i = 0; i < numDims_; i++) { - std::vector prePos; - if (coordIter.getPrePos(delays_, i, prePos)) { - preOffsetV[i] = coordIter.offset(prePos); - } else { - preOffsetV[i] = -1; - } - } - - for (int i = 0; i < numDims_; i++) { - if (preOffsetV[i] >= 0) { - frameInputGate_[idxCurr].value->addDotMul( - *frameState_[start + preOffsetV[i]].value, *checkIg_, 1.0, 1.0); - - MatrixPtr fgGateOneDim = Matrix::create( - frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - MatrixPtr checkFgOneDim = - Matrix::create(checkFg_->getData() + i * numBlocks_, - 1.0, - numBlocks_, - false, - useGpu_); - fgGateOneDim->addDotMul( - *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0); - } - } - auto status = activationGate_->forward(frameInputGate_[idxCurr]); - status.check(); - status = activationGate_->forward(frameForgetGate_[idxCurr]); - status.check(); - status = activation_->forward(frameInputNode_[idxCurr]); - status.check(); - - frameState_[idxCurr].value->zeroMem(); - for (int i = 0; i < numDims_; i++) { - if (preOffsetV[i] >= 0) { - MatrixPtr fgGateOneDim = Matrix::create( - frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - frameState_[idxCurr].value->addDotMul( - *frameState_[start + preOffsetV[i]].value, *fgGateOneDim, 1.0, 1.0); - } - } - frameState_[idxCurr].value->addDotMul(*frameInputNode_[idxCurr].value, - *frameInputGate_[idxCurr].value, - 1.0, - 1.0); - - 
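The state update just computed gives each spatial dimension its own forget gate and its own predecessor state, while the input node enters through a single input gate. A scalar sketch of that accumulation, assuming one value per block (mdLstmState is an illustrative name):

```cpp
#include <vector>

// Sketch of the MD-LSTM cell update above:
// state_i = inputNode_i * inputGate_i + sum_j forgetGate_{i,j} * prevState_{i,j}
double mdLstmState(double inputNode, double inputGate,
                   const std::vector<double>& forgetGates,
                   const std::vector<double>& prevStates) {
  double state = inputNode * inputGate;
  // In the layer, a dimension with no predecessor simply contributes nothing.
  for (size_t j = 0; j < forgetGates.size(); ++j) {
    state += forgetGates[j] * prevStates[j];
  }
  return state;
}

int main() {
  // Two dimensions, e.g. the left and top neighbours of a pixel.
  double s = mdLstmState(0.8, 0.9, {0.5, 0.7}, {1.0, 2.0});
  return s > 0.0 ? 0 : 1;
}
```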
frameOutputGate_[idxCurr].value->addDotMul( - *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0); - status = activationGate_->forward(frameOutputGate_[idxCurr]); - status.check(); - - framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value)); - status = activationState_->forward(framePreOutput_[idxCurr]); - status.check(); - - frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value, - *frameOutputGate_[idxCurr].value); -} - -void MDLstmLayer::forwardOneSequence(int start, CoordIterator& coordIter) { - for (coordIter.begin(); !coordIter.end(); ++coordIter) { - int offset = coordIter.offset(); - for (int i = 0; i < numDims_; i++) { - std::vector prePos; - if (coordIter.getPrePos(delays_, i, prePos)) { - int preOffset = coordIter.offset(prePos); - frameGate_[start + offset].value->mul( - *frameOutput_[start + preOffset].value, *weight_->getW(), 1.0, 1.0); - } - } - forwardGate2OutputSequence(start, coordIter); - } -} - -void MDLstmLayer::backward(const UpdateCallback& callback) { - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - const int* starts = input.sequenceStartPositions->getData(false); - size_t numSequences = input.getNumSequences(); - - Matrix::resizeOrCreate(gate_.grad, - /* height= */ batchSize, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - - for (int i = 0; i < batchSize; i++) { - if (frameState_[i].grad == NULL) - frameState_[i].grad = Matrix::create( - /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); - } - for (int i = 0; i < batchSize; i++) { - if (framePreOutput_[i].grad == NULL) - framePreOutput_[i].grad = Matrix::create( - /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); - } - - for (int i = 0; i < batchSize; i++) { - frameOutput_[i].grad->setData(output_.grad->getData() + i * numBlocks_); - frameGate_[i].grad->setData(gate_.grad->getData() + - i * numBlocks_ * (3 + numDims_)); - frameInputNode_[i].grad->setData(gate_.grad->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 0); - frameInputGate_[i].grad->setData(gate_.grad->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 1); - frameForgetGate_[i].grad->setData(gate_.grad->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 2); - frameOutputGate_[i].grad->setData(gate_.grad->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * (2 + numDims_)); - } - - { - AsyncGpuBlock asyncGpuBlock; - - for (size_t i = 0; i < numSequences; i++) { - CoordIterator coordIter(dimsV_[i], directions_); - backwardOneSequence(starts[i], coordIter); - } - } - - if (input.grad) { - input.grad->add(*gate_.grad); - } - if (bias_ && bias_->getWGrad()) { - localBiasGrad_->collectBias(*gate_.grad, 1); - bias_->getParameterPtr()->incUpdate(callback); - } - - weight_->getParameterPtr()->incUpdate(callback); -} - -void MDLstmLayer::backwardGate2OutputSequence(int start, - CoordIterator& coordIter) { - int idxCurr = start + coordIter.offset(); - std::vector preOffsetV; - std::vector nextOffsetV; - preOffsetV.reserve(numDims_); - nextOffsetV.reserve(numDims_); - for (int i = 0; i < numDims_; i++) { - std::vector prePos; - if (coordIter.getPrePos(delays_, i, prePos)) { - preOffsetV[i] = coordIter.offset(prePos); - } else { - preOffsetV[i] = -1; - } - std::vector nextPos; - if (coordIter.getNextPos(delays_, i, nextPos)) { - nextOffsetV[i] = coordIter.offset(nextPos); - } else { - nextOffsetV[i] = -1; - } - } - - framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad, - 
*frameOutputGate_[idxCurr].value); - activationState_->backward(framePreOutput_[idxCurr]).check(); - frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad)); - - frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad, - *framePreOutput_[idxCurr].value); - activationGate_->backward(frameOutputGate_[idxCurr]).check(); - - frameState_[idxCurr].grad->addDotMul( - *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0); - for (int i = 0; i < numDims_; i++) { - if (nextOffsetV[i] >= 0) { - frameState_[idxCurr].grad->addDotMul( - *frameInputGate_[start + nextOffsetV[i]].grad, *checkIg_, 1.0, 1.0); - - MatrixPtr fgGateOneDimGrad = Matrix::create( - frameForgetGate_[start + nextOffsetV[i]].grad->getData() + - i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - MatrixPtr fgGateOneDimVal = Matrix::create( - frameForgetGate_[start + nextOffsetV[i]].value->getData() + - i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - MatrixPtr checkFgOneDim = Matrix::create( - checkFg_->getData() + i * numBlocks_, 1, numBlocks_, false, useGpu_); - - frameState_[idxCurr].grad->addDotMul( - *fgGateOneDimGrad, *checkFgOneDim, 1.0, 1.0); - frameState_[idxCurr].grad->addDotMul( - *frameState_[start + nextOffsetV[i]].grad, - *fgGateOneDimVal, - 1.0, - 1.0); - } - } - - frameInputNode_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad, - *frameInputGate_[idxCurr].value); - frameInputGate_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad, - *frameInputNode_[idxCurr].value); - - frameForgetGate_[idxCurr].grad->zeroMem(); - for (int i = 0; i < numDims_; i++) { - if (preOffsetV[i] >= 0) { - MatrixPtr fgGateOneDimGrad = Matrix::create( - frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - fgGateOneDimGrad->addDotMul(*frameState_[idxCurr].grad, - *frameState_[start + preOffsetV[i]].value, - 1.0, - 1.0); - } - } - - activationGate_->backward(frameInputGate_[idxCurr]).check(); - activationGate_->backward(frameForgetGate_[idxCurr]).check(); - activation_->backward(frameInputNode_[idxCurr]).check(); - - if (bias_->getWGrad()) { - for (int i = 0; i < numDims_; i++) { - if (preOffsetV[i] >= 0) { - checkIgGrad_->addDotMul(*frameInputGate_[idxCurr].grad, - *frameState_[start + preOffsetV[i]].value, - 1.0, - 1.0); - - MatrixPtr fgGateOneDimGrad = Matrix::create( - frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - MatrixPtr checkFgOneDimGrad = - Matrix::create(checkFgGrad_->getData() + i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - checkFgOneDimGrad->addDotMul(*fgGateOneDimGrad, - *frameState_[start + preOffsetV[i]].value, - 1.0, - 1.0); - } - } - checkOgGrad_->addDotMul( - *frameOutputGate_[idxCurr].grad, *frameState_[idxCurr].value, 1.0, 1.0); - } -} - -void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) { - MatrixPtr weightT = weight_->getW()->getTranspose(); - for (coordIter.rbegin(); !coordIter.end(); --coordIter) { - int offset = coordIter.offset(); - backwardGate2OutputSequence(start, coordIter); - for (int i = 0; i < numDims_; i++) { - std::vector prePos; - if (coordIter.getPrePos(delays_, i, prePos)) { - int preOffset = coordIter.offset(prePos); - frameOutput_[start + preOffset].grad->mul( - *frameGate_[start + offset].grad, *weightT, 1.0, 1.0); - if (weight_->getWGrad()) { - weight_->getWGrad()->mul( - *frameOutput_[start + preOffset].value->getTranspose(), - *frameGate_[start + offset].grad, - 1.0, - 1.0); - } - } - } - } -} - -} // namespace paddle diff 
--git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp deleted file mode 100644 index a442a0a01369f4ceb27ba4a1976df7f6e25b832f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ /dev/null @@ -1,388 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MKLDNNConvLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/Logging.h" - -using namespace mkldnn; // NOLINT -typedef memory::format format; - -namespace paddle { - -REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer); - -bool MKLDNNConvLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!MKLDNNLayer::init(layerMap, parameterMap)) { - return false; - } - CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet"; - CHECK_EQ(inputLayers_.size(), parameters_.size()); - CHECK(config_.shared_biases()) << "Only support shared biases yet"; - - oc_ = config_.num_filters(); - const ConvConfig& conf = config_.inputs(0).conv_conf(); - ic_ = conf.channels(); - fw_ = conf.filter_size(); - fh_ = conf.filter_size_y(); - pw_ = conf.padding(); - ph_ = conf.padding_y(); - dw_ = conf.dilation(); - dh_ = conf.dilation_y(); - sw_ = conf.stride(); - sh_ = conf.stride_y(); - gp_ = conf.groups(); - oh_ = conf.output_y(); - ow_ = conf.output_x(); - ih_ = conf.img_size_y(); - iw_ = conf.img_size(); - caffeMode_ = conf.caffe_mode(); - CHECK(caffeMode_) << "Only support caffe mode yet"; - CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet"; - // check group setting - CHECK_EQ((oc_ / gp_) * gp_, oc_) << "group is indivisible for oc"; - CHECK_EQ((ic_ / gp_) * gp_, ic_) << "group is indivisible for ic"; - - // create weight - size_t height = oc_ / gp_; - size_t width = ic_ * fh_ * fw_; - CHECK_EQ(parameters_[0]->getSize(), height * width); - weight_ = - std::unique_ptr(new Weight(height, width, parameters_[0], 0)); - - // create biases - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, oc_, biasParameter_, 0)); - } - return true; -} - -void MKLDNNConvLayer::convertWeightsFromPaddle() { - if (hasInitedWgt_) { - return; - } - - CHECK(wgtVal_) << "should have been initialized"; - // the paddle weight format is oihw or goihw - auto targetDim = wgtVal_->getDims(); - auto srcFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw; - wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); - hasInitedWgt_ = true; -} - -void MKLDNNConvLayer::convertWeightsToPaddle() { - CHECK(wgtVal_) << "should have been initialized"; - auto targetDim = wgtVal_->getDims(); - auto dstFmt = (gp_ == 1) ? 
memory::format::oihw : memory::format::goihw; - wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); -} - -void MKLDNNConvLayer::reshape( - int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { - reshapeInput(bs, ih, iw); - - // cal output sizes - // oc can not be changed - int fh = (fh_ - 1) * dh_ + 1; - int fw = (fw_ - 1) * dw_ + 1; - oh = outputSize(ih, fh, ph_, sh_, caffeMode_); - ow = outputSize(iw, fw, pw_, sw_, caffeMode_); - - reshapeOutput(oh, ow); - resizeOutput(bs, oc * oh * ow); -} - -void MKLDNNConvLayer::resetFwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) { - resetFwdPD(fwdPD_); - - resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out); - - resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out); -} - -void MKLDNNConvLayer::resetBwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) { - std::shared_ptr bwdWgtPD; - std::shared_ptr bwdDataPD; - - resetBwdWgtPD(bwdWgtPD); - - resetBwdDataPD(bwdDataPD); - - resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out); - - resetBwdPipeline( - pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out); -} - -void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) { - weight_->getParameterPtr()->incUpdate(callback); - if (biases_ && biases_->getWGrad()) { - biases_->getParameterPtr()->incUpdate(callback); - } -} - -void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt, - memory::dims& bias, - memory::dims& stride, - memory::dims& dilation, - memory::dims& padL, - memory::dims& padR) { - wgt = (gp_ == 1) ? memory::dims{oc_, ic_, fh_, fw_} - : memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_}; - bias = memory::dims{oc_}; - stride = memory::dims{sh_, sw_}; - padL = memory::dims{ph_, pw_}; - padR = getPaddingR(); - // note: mkldnn dilation start from 0 - dilation = memory::dims{dh_ - 1, dw_ - 1}; -} - -void MKLDNNConvLayer::resetFwdPD( - std::shared_ptr& pd) { - // dims for conv - memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - memory::dims wgtDims, biasDims, strides, dilations, padL, padR; - loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); - - prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring - : prop_kind::forward_training; - algorithm algo = algorithm::convolution_direct; - padding_kind padKind = padding_kind::zero; - conv_fwd::desc fwdDesc = - biases_ && biases_->getW() - ? 
conv_fwd::desc(pk, - algo, - MKLDNNMatrix::createMemoryDesc(inDims), - MKLDNNMatrix::createMemoryDesc(wgtDims), - MKLDNNMatrix::createMemoryDesc(biasDims), - MKLDNNMatrix::createMemoryDesc(outDims), - strides, - dilations, - padL, - padR, - padKind) - : conv_fwd::desc(pk, - algo, - MKLDNNMatrix::createMemoryDesc(inDims), - MKLDNNMatrix::createMemoryDesc(wgtDims), - MKLDNNMatrix::createMemoryDesc(outDims), - strides, - dilations, - padL, - padR, - padKind); - pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_)); -} - -void MKLDNNConvLayer::resetFwdBuffers( - std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - CHECK(pd); - resetInValue( - in, std::make_shared(pd->src_primitive_desc())); - - resetOutValue(out, pd->dst_primitive_desc()); - - resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc()); - - if (biases_ && biases_->getW()) { - resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); - } else { - bias = nullptr; - } -} - -void MKLDNNConvLayer::resetFwdPipeline( - std::vector& pipeline, - std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - if (bias) { - fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out)); - } else { - fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out)); - } - pipeline.push_back(*fwd_); -} - -void MKLDNNConvLayer::resetBwdWgtPD( - std::shared_ptr& pd) { - memory::dims wgtDims, biasDims, strides, dilations, padL, padR; - loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); - - // create backward weight using input, output and weight value memory desc - CHECK(inVals_[0]) << "Should have internal input value"; - CHECK(outVal_) << "Should have internal output value"; - CHECK(wgtVal_) << "Should have weight value"; - algorithm algo = algorithm::convolution_direct; - padding_kind padKind = padding_kind::zero; - auto bwdWgtDesc = biasVal_ != nullptr - ? 
conv_bwdWgt::desc(algo, - inVals_[0]->getMemoryDesc(), - wgtVal_->getMemoryDesc(), - biasVal_->getMemoryDesc(), - outVal_->getMemoryDesc(), - strides, - padL, - padR, - padKind) - : conv_bwdWgt::desc(algo, - inVals_[0]->getMemoryDesc(), - wgtVal_->getMemoryDesc(), - outVal_->getMemoryDesc(), - strides, - padL, - padR, - padKind); - pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_)); - CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc()); - CHECK_PRIMITIVE_DESC_EQ( - outVal_, - pd->diff_dst_primitive_desc(), - "primitive desc of out value and grad should be equal"); - CHECK_PRIMITIVE_DESC_EQ( - wgtVal_, - pd->diff_weights_primitive_desc(), - "primitive desc of weight value and grad should be equal"); -} - -void MKLDNNConvLayer::resetBwdDataPD( - std::shared_ptr& pd) { - pd = nullptr; - if (inputLayers_[0]->getOutput().grad == nullptr) { - return; - } - - memory::dims wgtDims, biasDims, strides, dilations, padL, padR; - loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); - CHECK(inVals_[0]) << "Should have internal input value"; - CHECK(outVal_) << "Should have internal output value"; - // create backward data using input and output value memory desc - // but using weight memory desc with any format - auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct, - inVals_[0]->getMemoryDesc(), - MKLDNNMatrix::createMemoryDesc(wgtDims), - outVal_->getMemoryDesc(), - strides, - padL, - padR, - padding_kind::zero); - pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_)); - CHECK_PRIMITIVE_DESC_EQ( - inVals_[0], - pd->diff_src_primitive_desc(), - "primitive desc of in value and grad should be equal"); - CHECK_PRIMITIVE_DESC_EQ( - outVal_, - pd->diff_dst_primitive_desc(), - "primitive desc of out value and grad should be equal"); -} - -void MKLDNNConvLayer::resetBwdBuffers( - std::shared_ptr& wgtPD, - std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - CHECK(wgtPD); - resetOutGrad(out, wgtPD->diff_dst_primitive_desc()); - - resetWithMatrix( - wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc()); - CHECK_PRIMITIVE_DESC_EQ( - wgtVal_, - wgt->getPrimitiveDesc(), - "primitive desc of weight grad and value should be equal"); - - bias = nullptr; - if (biases_ && biases_->getWGrad()) { - resetWithMatrix( - bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc()); - CHECK(bias); - CHECK_PRIMITIVE_DESC_EQ( - biasVal_, - bias->getPrimitiveDesc(), - "primitive desc of bias grad and value should be equal"); - } - - if (dataPD == nullptr) { - return; - } - resetInGrad(in, dataPD->diff_src_primitive_desc()); - resetWgtValBwdData(dataPD, wgtValBwdData_); -} - -void MKLDNNConvLayer::resetBwdPipeline( - std::vector& pipeline, - std::shared_ptr& wgtPD, - std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - CHECK(inVals_[0]); - // add bwdWgt handle - if (bias) { - bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias)); - } else { - bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt)); - } - pipeline.push_back(*bwdWgt_); - - if (dataPD == nullptr) { - return; - } - if (cvtWgtVal_) { - pipeline.push_back(*cvtWgtVal_); - } - // add bwdData handle - CHECK(wgtValBwdData_) << "Should have weight memory"; - bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in)); - pipeline.push_back(*bwdData_); -} - -void MKLDNNConvLayer::resetWgtValBwdData( - 
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& wgt) {
-  if (dataPD == nullptr) {
-    return;
-  }
-
-  // create a new weight value for backward data, and create a reorder if
-  // necessary, since the primitive_desc may differ from wgtVal_
-  CHECK(wgtVal_) << "should have weight value";
-  if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
-    wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
-    cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
-    CHECK(cvtWgtVal_);
-  } else {
-    wgtValBwdData_ = wgtVal_;
-  }
-  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
-                    << wgtValBwdData_->getFormat();
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
deleted file mode 100644
index 2b164d0d3bc0e1446d7e4d82bb8a713195dbd927..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ /dev/null
@@ -1,477 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "MKLDNNBase.h"
-#include "mkldnn.hpp"
-#include "paddle/math/MKLDNNMatrix.h"
-#include "paddle/utils/Stat.h"
-
-DECLARE_bool(use_mkldnn);
-
-namespace paddle {
-
-class MKLDNNLayer;
-typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
-
-/**
- * @brief Base class of MKLDNNLayer.
- *
- */
-class MKLDNNLayer : public Layer {
- protected:
-  // batch size
-  int bs_;
-  // their sizes are always from the first input layer
-  // input image channel, height and width
-  int ic_, ih_, iw_;
-  // output image channel, height and width
-  int oc_, oh_, ow_;
-
-  // the condition under which the forward pass needs to be reset
-  size_t condition_;
-  // backward also needs to be reset after resetting the forward handle
-  bool needResetBwd_;
-
-  // whether the output is mkldnn-only
-  bool outputOnlyMKLDNN_;
-
-  // mkldnn engine, stream and primitives
-  mkldnn::engine engine_;
-  std::shared_ptr<MKLDNNStream> stream_;
-  std::shared_ptr<mkldnn::primitive> fwd_;
-  std::shared_ptr<mkldnn::primitive> bwdWgt_;
-  std::shared_ptr<mkldnn::primitive> bwdData_;
-  std::vector<mkldnn::primitive> pipelineFwd_;
-  std::vector<mkldnn::primitive> pipelineBwd_;
-
-  /* Value and grad are separated as internal and external buffers.
-   * Each MKLDNNLayer must init or reset the internal buffer at least,
-   * and the external buffer format is always nchw or nc (when h == w == 1),
-   * which is the same format as paddle.
-   * The output_.value and output_.grad always save the external data,
-   * when mixed with a cpu device.
-   * When all layers are mkldnn layers, they can save internal data.
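The internal/external split above exists because MKL-DNN kernels prefer blocked layouts such as nChw8c while Paddle supplies plain nchw. A hand-rolled sketch of one such repacking, for illustration only — the layer itself delegates to mkldnn::reorder via MKLDNNMatrix::createReorder, and nchwToNChw8c here is a hypothetical helper that assumes the channel count is a multiple of 8:

```cpp
#include <vector>

// Repack nchw -> nChw8c: channels are split into blocks of 8 and the block
// index becomes the innermost dimension, which is the layout many MKL-DNN
// convolution kernels expect. Layout: [n][c/8][h][w][8].
std::vector<float> nchwToNChw8c(const std::vector<float>& src,
                                int n, int c, int h, int w) {
  const int B = 8;  // channel block size
  std::vector<float> dst(src.size());
  for (int in = 0; in < n; ++in)
    for (int ic = 0; ic < c; ++ic)
      for (int ih = 0; ih < h; ++ih)
        for (int iw = 0; iw < w; ++iw) {
          int srcIdx = ((in * c + ic) * h + ih) * w + iw;
          int dstIdx =
              ((((in * (c / B) + ic / B) * h + ih) * w + iw) * B) + ic % B;
          dst[dstIdx] = src[srcIdx];
        }
  return dst;
}

int main() {
  std::vector<float> x(1 * 8 * 2 * 2);
  for (size_t i = 0; i < x.size(); ++i) x[i] = float(i);
  std::vector<float> y = nchwToNChw8c(x, 1, 8, 2, 2);
  (void)y;
  return 0;
}
```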
-   */
-  // below MKLDNNMatrix buffers are all internal buffers
-  std::vector<MKLDNNMatrixPtr> inVals_;
-  std::vector<MKLDNNMatrixPtr> inGrads_;
-  MKLDNNMatrixPtr outVal_;
-  MKLDNNMatrixPtr outGrad_;
-  // below are external value and grad
-  std::vector<MKLDNNMatrixPtr> extInVals_;
-  std::vector<MKLDNNMatrixPtr> extInGrads_;
-  MKLDNNMatrixPtr extOutVal_;
-  MKLDNNMatrixPtr extOutGrad_;
-  // convert handles between external and internal buffers
-  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
-  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
-  // weight and bias are always internal buffers
-  MKLDNNMatrixPtr wgtVal_;
-  MKLDNNMatrixPtr wgtGrad_;
-  MKLDNNMatrixPtr biasVal_;
-  MKLDNNMatrixPtr biasGrad_;
-
-  // merge grad primitive
-  std::shared_ptr<mkldnn::primitive> mergeGrad_;
-  std::vector<mkldnn::primitive> pipelineMergeGrad_;
-  // tmp input argument to save input grad, only used to merge grad
-  Argument tmpInArg_;
-
- public:
-  explicit MKLDNNLayer(const LayerConfig& config)
-      : Layer(config),
-        ih_(0),
-        iw_(0),
-        condition_(0),
-        needResetBwd_(true),
-        outputOnlyMKLDNN_(false),
-        engine_(mkldnn::engine::cpu, 0),
-        stream_(nullptr),
-        fwd_(nullptr),
-        bwdWgt_(nullptr),
-        bwdData_(nullptr) {}
-
-  ~MKLDNNLayer() {}
-
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
-
-  /**
-   * reshape the input and output channels and image sizes
-   * and reset the output buffer size
-   */
-  virtual void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
-
-  /**
-   * reset the mkldnn forward primitive and memories;
-   * only called when the input size changes.
-   * weight and bias buffers should be covered by the child class itself
-   */
-  virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out) = 0;
-
-  /**
-   * reset the mkldnn backward primitive and memories;
-   * only called when needed.
-   * weight and bias buffers should be covered by the child class itself
-   */
-  virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out) = 0;
-
-  /**
-   * Update weights and biases if necessary.
-   */
-  virtual void updateWeights(const UpdateCallback& callback) {}
-
-  /**
-   * convert the weight from paddle format to mkldnn format;
-   * weight_ will be overwritten
-   */
-  virtual void convertWeightsFromPaddle() {}
-
-  /**
-   * convert the mkldnn weight to paddle format;
-   * weight_ will be overwritten
-   */
-  virtual void convertWeightsToPaddle() {}
-
-  /**
-   * add this interface as public for unit test
-   */
-  void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
-
- protected:
-  /**
-   * Some layers may have a different condition for resetting the forward.
-   * The function returns the condition under which the forward pass does
-   * not need to be reset.
-   */
-  inline virtual size_t keepCondition() {
-    // reset when the first input element size changes, not only the batch size
-    return inputLayers_[0]->getOutputValue()->getElementCnt();
-  }
-
-  /**
-   * reshape the input image sizes and input batch size
-   */
-  void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
-
-  /**
-   * reshape the output image sizes
-   */
-  void reshapeOutput(size_t height, size_t width);
-
-  /**
-   * reset an MKLDNNMatrix from a Matrix and an internal primitive desc;
-   * resets to nullptr if the matrix or primitive desc is empty
-   */
-  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
-                       const MatrixPtr& mat,
-                       mkldnn::memory::primitive_desc pd);
-
-  /**
-   * reset the input value from the input MKLDNNMatrix and internal primitive desc.
- * reset both internal and external buffer and create reorder if necessary. - * input channel may be different in concat. - */ - void resetInValue( - MKLDNNMatrixPtr& in, - const std::shared_ptr& intPD = nullptr, - size_t idx = 0, - int inputChannel = 0); - - /** - * reset output value from internal primitive desc. - * reset both internal and external buffer and create reorder if necessary. - */ - void resetOutValue(MKLDNNMatrixPtr& out, - mkldnn::memory::primitive_desc intPD); - - /** - * reset input grad from internal primitive desc. - * reset both internal and external buffer and create reorder if necessary. - */ - void resetInGrad(MKLDNNMatrixPtr& in, - mkldnn::memory::primitive_desc intPD, - size_t idx = 0); - - /** - * reset output grad from internal primitive desc. - * merge grad if necessary. - * reset both internal and external buffer and create reorder if necessary. - * note: about merge grad, when this layer has several outputs, - * it could not be mixed with cpu device, - * since it can not get memory desc from cpu device. - */ - void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD); - - /** - * reset the merge grad primitive if necessary. - * note: do not support the grads mixed with cpu device, - * since it can not get memory desc from cpu device. - */ - void resetMergeGrad(MKLDNNMatrixPtr& out); - - protected: - /** - * Set deviceId of this layer. - */ - void setDevice(int id) { deviceId_ = id; } - - /** - * check the format is nchw or nc, - * which is supported by Paddle default memory layout - */ - bool isPaddleFormat(mkldnn::memory::format fmt) { - if (fmt == mkldnn::memory::format::nchw || - fmt == mkldnn::memory::format::nc) { - return true; - } else { - return false; - } - } - - /** - * If input only has MKLDNN device. - * Otherwise, only support the previous layer using CPU device. - */ - bool inputIsOnlyMKLDNN(int index = 0) { - int prevDevice = getPrev(index)->getDeviceId(); - if (prevDevice == MKLDNN_DEVICE) { - return true; - } else { - CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet"; - return false; - } - } - - /** - * If output only has MKLDNN device. - * Otherwise, other devices should only using CPU device. - */ - bool outputIsOnlyMKLDNN() { - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE) - << "Only support other device is CPU yet"; - } - outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0; - return outputOnlyMKLDNN_; - } - - /** - * print info about sizes - */ - virtual void printSizeInfo() { - VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_ - << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_ - << ", oh: " << oh_ << ", ow: " << ow_; - } - - /** - * print the mkldnn memory format of value - */ - virtual void printValueFormat() { - for (size_t i = 0; i < inVals_.size(); ++i) { - if (!inVals_[i]) { - continue; - } - VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName() - << ": " << (extInVals_[i] ? extInVals_[i]->getFormat() - : inVals_[i]->getFormat()) - << " >>> " << inVals_[i]->getFormat() << " >>>"; - } - if (outVal_) { - VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> " - << (extOutVal_ ? 
extOutVal_->getFormat() - : outVal_->getFormat()); - } - if (wgtVal_) { - VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat(); - } - if (biasVal_) { - VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat(); - } - } - - /** - * print the mkldnn memory format of grad - */ - virtual void printGradFormat() { - if (outGrad_) { - VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< " - << (extOutGrad_ ? extOutGrad_->getFormat() - : outGrad_->getFormat()); - } - for (size_t i = 0; i < inGrads_.size(); ++i) { - if (!inGrads_[i]) { - continue; - } - VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName() - << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat() - : inGrads_[i]->getFormat()) - << " <<< " << inGrads_[i]->getFormat() << " <<<"; - } - if (wgtGrad_) { - VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat(); - } - if (biasGrad_) { - VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat(); - } - } - - private: - /** - * clear all grad - */ - void clearGrads() { - if (output_.grad) { - output_.grad->zeroMem(); - } - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - if (outputOtherDevice_[i].grad) { - outputOtherDevice_[i].grad->zeroMem(); - } - } - } - - /** - * Set deviceId of the params used in this layer. - */ - void setParamsDevice(int id, const ParameterMap& parameterMap) { - for (auto& inputConfig : config_.inputs()) { - if (inputConfig.has_input_parameter_name()) { - ParameterPtr parameter; - std::string name = inputConfig.input_parameter_name(); - CHECK(mapGet(name, parameterMap, ¶meter)) - << "Cannot find input parameter " << name << " for layer " - << getName(); - parameter->setDevice(id); - } - } - if (config_.has_bias_parameter_name()) { - ParameterPtr parameter; - std::string name = config_.bias_parameter_name(); - CHECK(mapGet(name, parameterMap, ¶meter)) - << "Cannot find bias parameter " << name << " for layer " - << getName(); - parameter->setDevice(id); - } - } - - /** - * Set output map of prev layers. - */ - void setOutputMap() { - outputMap_.clear(); - for (size_t i = 0; i < inputLayers_.size(); ++i) { - inputLayers_[i]->setOutput(getName(), &tmpInArg_); - } - } - - /** - * if have cpu device, share value and grad data with output_ - */ - void shareCPUDevice() { - if (outputIsOnlyMKLDNN()) { - return; - } - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - outputOtherDevice_[i].value = output_.value; - outputOtherDevice_[i].grad = output_.grad; - } - } - - /** - * Check the cpu device number of outputOtherDevice_. - * should have only one at most. - */ - void checkCPUOutputsNumber(int max = 1) { - int cnt = 0; - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - if (outputOtherDevice_[i].deviceId == CPU_DEVICE) { - ++cnt; - } - } - CHECK_LE(cnt, max) << "too much CPU devies"; - } - - /** - * copy SeqInfo from input layer to this output and other output devices. - * @note: do not use getInput(0) since it used this deviceId_, - * use "inputLayers_[0]->getOutput()" instead. 
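Taken together, keepCondition() and the reset hooks earlier in this header implement a cache-and-rebuild pattern: primitives are rebuilt only when the input geometry changes, and a rebuilt forward invalidates the backward. A minimal sketch of that control flow, with hypothetical standalone types in place of the real Layer machinery:

```cpp
#include <cstdio>

// Sketch of MKLDNNLayer::forward's reset logic: primitives are rebuilt only
// when the cached condition (e.g. input element count) changes, and a rebuilt
// forward also forces the next backward to be rebuilt.
class CachedLayer {
 public:
  void forward(size_t inputElements) {
    if (inputElements != condition_) {
      condition_ = inputElements;
      std::printf("reshape + resetFwd for %zu elements\n", inputElements);
      needResetBwd_ = true;  // backward must be reset after a forward reset
    }
    // ... run the forward pipeline ...
  }
  void backward() {
    if (needResetBwd_) {
      std::printf("resetBwd\n");
      needResetBwd_ = false;
    }
    // ... run the backward pipeline ...
  }

 private:
  size_t condition_ = 0;
  bool needResetBwd_ = true;
};

int main() {
  CachedLayer layer;
  layer.forward(128);  // first batch: builds everything
  layer.backward();
  layer.forward(128);  // same size: reuses the primitives
  layer.backward();
  layer.forward(256);  // size changed: rebuild
  layer.backward();
  return 0;
}
```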
-  */
- void copySeqInfoToOutputs() {
-   if (inputLayers_.empty() || !needSequenceInfo_) {
-     return;
-   }
-   const Argument& input = inputLayers_[0]->getOutput();
-   output_.sequenceStartPositions = input.sequenceStartPositions;
-   output_.subSequenceStartPositions = input.subSequenceStartPositions;
-   output_.cpuSequenceDims = input.cpuSequenceDims;
-   for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-     outputOtherDevice_[i].sequenceStartPositions =
-         output_.sequenceStartPositions;
-     outputOtherDevice_[i].subSequenceStartPositions =
-         output_.subSequenceStartPositions;
-     outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
-   }
- }
-
- void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
-   // The MKLDNNLayer output value should be a MKLDNNMatrix,
-   // so an external output value is necessary.
-   // An external input value is not necessary,
-   // since the input may be an mkldnn internal buffer.
-   CHECK(extOutVal_) << "external output value is necessary";
-   output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
-   CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
-   for (size_t i = 0; i < cvtInVals_.size(); ++i) {
-     if (cvtInVals_[i]) {
-       pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
-     }
-   }
-   if (cvtOutVal_) {
-     pipeline.push_back(*cvtOutVal_);
-   }
- }
- void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
-   // An external output grad is not necessary,
-   // since the output may be an mkldnn internal buffer or merged directly.
-   CHECK(outGrad_) << "internal output grad is necessary";
-   if (extOutGrad_) {
-     CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
-         << "the external buffer should share the same data with output_.grad";
-   }
-   if (cvtOutGrad_) {
-     pipeline.insert(pipeline.begin(), *cvtOutGrad_);
-   }
-   for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
-     if (cvtInGrads_[i]) {
-       pipeline.push_back(*cvtInGrads_[i]);
-     }
-   }
- }
-};
-
-} // namespace paddle
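The value/grad helpers above all follow one pattern: keep an external buffer in Paddle's layout and an internal MKL-DNN buffer, and queue a reorder primitive only when the two layouts differ. A minimal sketch of that pattern, assuming the MKL-DNN 0.x C++ API; `addReorderIfNeeded` is an illustrative name, not the deleted Paddle code:

```cpp
#include <memory>
#include <vector>
#include "mkldnn.hpp"

// Queue a conversion between an external and an internal buffer only
// when their primitive descriptors (layouts) actually differ; otherwise
// the buffer can be shared and no reorder primitive is needed.
static std::shared_ptr<mkldnn::reorder> addReorderIfNeeded(
    const mkldnn::memory& ext,
    const mkldnn::memory& internal,
    std::vector<mkldnn::primitive>& pipeline) {
  if (ext.get_primitive_desc() == internal.get_primitive_desc()) {
    return nullptr;  // same format: share data, no conversion
  }
  auto cvt = std::make_shared<mkldnn::reorder>(ext, internal);
  pipeline.push_back(*cvt);  // executed as part of the stream
  return cvt;
}
```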
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
deleted file mode 100644
index 3be848c7496aac616903cb09844c5eadd320e91c..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNPoolLayer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer);
-
-bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  /* the size of inputs for pool-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-  const PoolConfig& conf = config_.inputs(0).pool_conf();
-  ic_ = conf.channels();
-  ih_ = conf.img_size_y();
-  iw_ = conf.img_size();
-  oc_ = ic_;
-  oh_ = conf.output_y();
-  ow_ = conf.output_x();
-  fh_ = conf.size_y();
-  fw_ = conf.size_x();
-  ph_ = conf.padding_y();
-  pw_ = conf.padding();
-  sh_ = conf.stride_y();
-  sw_ = conf.stride();
-
-  const std::string& type = conf.pool_type();
-  if (type == "max-projection") {
-    poolAlgo_ = algorithm::pooling_max;
-  } else if (type == "avg-projection") {
-    // paddle only uses exclude_padding
-    poolAlgo_ = algorithm::pooling_avg_exclude_padding;
-  } else {
-    LOG(FATAL) << "unknown pooling type!";
-  }
-  return true;
-}
-
-void MKLDNNPoolLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-  // ic_ and oc can not be changed
-  CHECK_EQ((size_t)ic,
-           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
-      << "Input channel can not be changed";
-
-  // calculate output sizes
-  // paddle uses caffeMode = false for pooling
-  oh = outputSize(ih, fh_, ph_, sh_, false);
-  ow = outputSize(iw, fw_, pw_, sw_, false);
-  reshapeOutput(oh, ow);
-
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs[0], out);
-
-  resetFwdPD(fwdPD_, inputs[0], out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
-}
-
-void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  std::shared_ptr<pool_bwd::primitive_desc> pd;
-
-  resetBwdBuffers(inputs[0], out);
-
-  resetBwdPD(pd, inputs[0], out);
-
-  resetBwdPipeline(pipeline, pd, inputs[0], out);
-}
-
-void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                      MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  CHECK(in);
-  auto outPD =
-      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
-  resetOutValue(out, outPD);
-}
-
-void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
-                                 MKLDNNMatrixPtr in,
-                                 MKLDNNMatrixPtr out) {
-  memory::dims kernels = memory::dims{fh_, fw_};
-  memory::dims strides = memory::dims{sh_, sw_};
-  memory::dims padL = memory::dims{ph_, pw_};
-  memory::dims padR = getPaddingR();
-  padding_kind padKind = padding_kind::zero;
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  auto fwdDesc = pool_fwd::desc(pk,
-                                poolAlgo_,
-                                in->getMemoryDesc(),
-                                out->getMemoryDesc(),
-                                strides,
-                                kernels,
-                                padL,
-                                padR,
-                                padKind);
-  pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
-
-  // prepare workspace if necessary
-  workspace_ =
-      (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
-          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
-          : nullptr;
-}
-
-void MKLDNNPoolLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<pool_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  fwd_ = workspace_
-             ?
-             std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
-             : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                      MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-}
-
-void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
-                                 MKLDNNMatrixPtr& in,
-                                 MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  memory::dims kernels = memory::dims{fh_, fw_};
-  memory::dims strides = memory::dims{sh_, sw_};
-  memory::dims padL = memory::dims{ph_, pw_};
-  memory::dims padR = getPaddingR();
-  CHECK(out);
-  auto bwdDesc = pool_bwd::desc(poolAlgo_,
-                                in->getMemoryDesc(),
-                                out->getMemoryDesc(),
-                                strides,
-                                kernels,
-                                padL,
-                                padR,
-                                padding_kind::zero);
-  pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNPoolLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<pool_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  if (pd == nullptr) {
-    return;
-  }
-
-  bwdData_ =
-      workspace_
-          ? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in))
-          : std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-} // namespace paddle
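The layer derives its right/bottom padding from the ceil-mode output size (`outputSize(..., caffeMode = false)` in `reshape()` above). A small sketch of that arithmetic, under the assumption that MKL-DNN needs the strided window to cover the padded input exactly; `paddingR` is an illustrative helper, not the deleted `getPaddingR()`:

```cpp
// Output size with caffeMode == false (ceil mode), as used in reshape():
//   out = (in - kernel + 2 * pad + stride - 1) / stride + 1
// Right/bottom padding that makes the window arithmetic come out exact:
static int paddingR(int in, int kernel, int padL, int stride, int out) {
  return (out - 1) * stride + kernel - in - padL;
}
// Example: in = 7, kernel = 3, padL = 0, stride = 2
//   out = (7 - 3 + 0 + 1) / 2 + 1 = 3,  paddingR = 2 * 2 + 3 - 7 - 0 = 0
```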
diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h
deleted file mode 100644
index b01a961d007a0e2e343db7b51e50fd3ee776435e..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLPackedWeight.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/math/MathFunctions.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/Weight.h"
-
-namespace paddle {
-
-class MKLPackedWeight {
- protected:
-  /// The pointer to the weight data
-  real *weight_;
-  /// The pointer to the cblas-packed copy of the weight
-  real *packedWeight_;
-  size_t height_;
-  size_t width_;
-  bool transW_;
-
- public:
-  explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
-    packedWeight_ = nullptr;
-    weight_ = weight->getData();
-    height_ = weight->getHeight();
-    width_ = weight->getWidth();
-    transW_ = transW;
-  }
-
-  ~MKLPackedWeight() { free_(); }
-
-  void pack() { pack_(weight_); }
-
-  void gemm_compute(const MatrixPtr src, MatrixPtr dst) {
-    cblas_sgemm_compute(CblasRowMajor,
-                        CblasNoTrans,
-                        CblasPacked,
-                        src->getHeight(),
-                        transW_ ? height_ : width_,
-                        transW_ ? width_ : height_,
-                        src->getData(),
-                        src->getWidth(),
-                        packedWeight_,
-                        width_,
-                        1.0,
-                        dst->getData(),
-                        dst->getWidth());
-  }
-
- protected:
-  void pack_(real *src) {
-    if (!packedWeight_) {
-      packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
-    }
-    cblas_sgemm_pack(CblasRowMajor,
-                     CblasBMatrix,
-                     transW_ ? CblasTrans : CblasNoTrans,
-                     1,
-                     transW_ ? height_ : width_,
-                     transW_ ? width_ : height_,
-                     1.0,
-                     src,
-                     width_,
-                     packedWeight_);
-  }
-
-  void free_() {
-    if (packedWeight_) {
-      cblas_sgemm_free(packedWeight_);
-    }
-  }
-};
-
-} // namespace paddle
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
deleted file mode 100644
index e46f997c342ce5d6b724629dff6950c4f1680ce8..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MaxLayer.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SequencePoolLayer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * A layer for "internal max" for sequence input.
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *   Output: output size is the number of input sequences (NOT input instances)
- *   output[i] = max_{for each instance in this sequence}{input[i]}
- * If stride_ > 0:
- *   Output: a shortened sequence. Stride is the step size by which we slide a
- *           window upon the input sequence, and the max pooling operation is
- *           then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *   Check that the input sequence has sub-sequences.
- *   Output: output size is the number of input sub-sequences
- *   output[i] = max_{for each instance in this sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-
-class MaxLayer : public SequencePoolLayer {
- protected:
-  // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
-  IVectorPtr maxIndex_;
-
- public:
-  explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    return SequencePoolLayer::init(layerMap, parameterMap);
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-} // namespace paddle
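For the kNonSeq case documented above, the computation reduces to one output row per sequence, holding the element-wise max over that sequence's instances. A self-contained sketch with illustrative names, not the deleted implementation:

```cpp
#include <algorithm>
#include <vector>

// in is rows x dim, row-major; seqStart holds the starting row of each
// sequence (size = numSeqs + 1), so sequence s spans rows
// [seqStart[s], seqStart[s + 1]). out gets one row of dim floats per
// sequence: the element-wise max over that sequence's rows.
void seqMaxPool(const float* in,
                int dim,
                const std::vector<int>& seqStart,
                float* out) {
  int numSeqs = static_cast<int>(seqStart.size()) - 1;
  for (int s = 0; s < numSeqs; ++s) {
    for (int d = 0; d < dim; ++d) {
      float m = in[seqStart[s] * dim + d];
      for (int r = seqStart[s] + 1; r < seqStart[s + 1]; ++r) {
        m = std::max(m, in[r * dim + d]);
      }
      out[s * dim + d] = m;
    }
  }
}
```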
diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h
deleted file mode 100644
index 0eb8674b4c4f3f58b103c6b59ad13931a6992a1b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MaxOutLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer to do max out on a conv layer's output.
- * Input: output of a conv layer.
- * Output: feature map size same as input. Channel is (input channel) / groups.
- * So the number of channels must be divisible by groups.
- *
- * The config file api is maxout_layer.
- */
-
-class MaxOutLayer : public Layer {
- protected:
-  size_t groups_;
-  size_t imgSizeH_, imgSizeW_;
-  /// outputChannels_ = channels_ / groups_
-  size_t channels_, outputChannels_;
-  /// feature length = imgSizeH_ * imgSizeW_
-  size_t featLen_;
-  IVectorPtr maxoutId_;
-
- public:
-  /// return imgSizeH_ * imgSizeW_ * outputChannels_;
-  size_t getSize();
-
-  explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
-  virtual ~MaxOutLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-} // namespace paddle
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
deleted file mode 100644
index c948364f6b83b0de1ee07cc185b69346f5cb1a7e..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MaxPoolWithMaskLayer.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include "PoolLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief Basic parent layer of different kinds of pooling
- */
-class MaxPoolWithMaskLayer : public PoolLayer {
- protected:
-  Argument mask_;
-
- public:
-  explicit MaxPoolWithMaskLayer(const LayerConfig& config)
-      : PoolLayer(config) {}
-
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-} // namespace paddle
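The maxout description above compresses every `groups` input channels into one output channel by an element-wise max. A hedged sketch of that reduction, assuming consecutive channels form a group (the deleted layer also records `maxoutId_` for the backward pass, omitted here; the exact channel grouping is an assumption):

```cpp
#include <algorithm>

// channels must be divisible by groups; featLen = imgSizeH * imgSizeW.
// Output channel oc is the element-wise max over input channels
// [oc * groups, (oc + 1) * groups).
void maxoutForward(const float* in, float* out,
                   int channels, int groups, int featLen) {
  int outChannels = channels / groups;
  for (int oc = 0; oc < outChannels; ++oc) {
    for (int i = 0; i < featLen; ++i) {
      float m = in[oc * groups * featLen + i];
      for (int g = 1; g < groups; ++g) {
        m = std::max(m, in[(oc * groups + g) * featLen + i]);
      }
      out[oc * featLen + i] = m;
    }
  }
}
```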
diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp
deleted file mode 100644
index 43ecc48cd97fb54d8dc4eb1d87ebf60f5aa040d8..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MultiplexLayer.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief This layer multiplexes multiple layers according to the index,
- * which is provided by the first input layer.
- * - Input[0]: the index, of size batchSize, selecting the layer to output.
- * - Input[1:N]: the candidate output data.
- * For each index i from 0 to batchSize - 1, the output is the i-th row of the
- * (index[i] + 1)-th layer.
- *
- * For the i-th row of the output:
- *
- * \f[
- * y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1)
- * \f]
- * where y is the output and \f$x_{k}\f$ is the k-th input layer with
- * \f$k = x_{0}[i] + 1\f$.
- */
-
-class MultiplexLayer : public Layer {
- protected:
-  /**
-   * @brief A struct used to save the copy information, including the input
-   * layer index and the copy size.
-   */
-  struct CopyInfo {
-    CopyInfo(int inStartIdx, int inLength, int inCopyIdx)
-        : startIdx(inStartIdx), length(inLength), copyIdx(inCopyIdx) {}
-
-    /// The start row of input.
-    int startIdx;
-    /// Number of rows. If the layer indices in Input[0] are not consecutive,
-    /// the length is one. Otherwise, the length is > 1 and multiple rows are
-    /// copied at once.
-    int length;
-    /// The copied layer index, which needs to add 1.
-    int copyIdx;
-  };
-
-  /// A list of CopyInfo used to save copy information.
-  std::vector<CopyInfo> copySchedule_;
-
-  /// Temporary matrix pointer to point to input data.
-  MatrixPtr tmpSrc_;
-  /// Temporary matrix pointer to point to output data.
-  MatrixPtr tmpDest_;
-
- public:
-  explicit MultiplexLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~MultiplexLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  /**
-   * @brief Calculate copy info for input layers.
-   */
-  void calculateCopySchedule(const IVectorPtr& copyIds, size_t numIns);
-};
-
-REGISTER_LAYER(multiplex, MultiplexLayer);
-
-void MultiplexLayer::calculateCopySchedule(const IVectorPtr& copyIds,
-                                           size_t numIns) {
-  copySchedule_.clear();
-  CopyInfo prevCopyInfo(0, 0, -1);
-  for (size_t i = 0; i < copyIds->getSize(); i++) {
-    int copyId = copyIds->getElement(i);
-    CHECK_GE(copyId, 0);
-    CHECK_LT(copyId, int(numIns));
-    // same input layer as the previous row, so extend the consecutive copy
-    if (copyId == prevCopyInfo.copyIdx) {
-      ++prevCopyInfo.length;
-    } else {
-      if (prevCopyInfo.copyIdx != -1) {
-        copySchedule_.emplace_back(prevCopyInfo);
-      }
-      prevCopyInfo.startIdx = i;
-      prevCopyInfo.length = 1;
-      prevCopyInfo.copyIdx = copyId;
-    }
-  }
-  if (prevCopyInfo.copyIdx != -1) {
-    copySchedule_.emplace_back(prevCopyInfo);
-  }
-}
-
-bool MultiplexLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_GE(inputLayers_.size(), 2U);
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  return true;
-}
-
-void MultiplexLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  IVectorPtr copyIds = getInput(0).ids;
-  MatrixPtr inV1 = getInputValue(1);
-  CHECK_EQ(copyIds->getSize(), inV1->getHeight());
-  for (size_t i = 2; i < inputLayers_.size(); i++) {
-    CHECK_EQ(inV1->getHeight(), getInputValue(i)->getHeight());
-    CHECK_EQ(inV1->getWidth(), getInputValue(i)->getWidth());
-  }
-
-  calculateCopySchedule(copyIds, inputLayers_.size() - 1);
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(inV1->getHeight(), inV1->getWidth());
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwLMultplexingTimer", getName().c_str());
-    AsyncGpuBlock block;
-    for (const CopyInfo& info : copySchedule_) {
-      outV->subMatrix(info.startIdx, info.length, tmpDest_)
-          ->copyFrom(*getInputValue(info.copyIdx + 1)
-                          ->subMatrix(info.startIdx, info.length, tmpSrc_));
-    }
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void MultiplexLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  MatrixPtr outG = getOutputGrad();
-
-  {
-    REGISTER_TIMER_INFO("BwLMultiplexTimer", getName().c_str());
-    AsyncGpuBlock block;
-    for (const CopyInfo& info : copySchedule_) {
-      if (getInputGrad(info.copyIdx + 1)) {
-        getInputGrad(info.copyIdx + 1)
-            ->subMatrix(info.startIdx, info.length, tmpDest_)
-            ->add(*outG->subMatrix(info.startIdx, info.length, tmpSrc_));
-      }
-    }
-  }
-}
-
-} // namespace paddle
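Stripped of the consecutive-row batching that `calculateCopySchedule` adds, the forward pass above is a per-row gather: output row i is row i of the candidate selected by index[i]. A minimal sketch with illustrative names, without the CopyInfo batching or GPU path:

```cpp
#include <algorithm>
#include <vector>

// candidates holds Input[1..N] (the index input is excluded), each a
// batchSize x width row-major matrix; index[i] picks the candidate
// whose i-th row becomes output row i.
void multiplexForward(const std::vector<const float*>& candidates,
                      const int* index, int batchSize, int width,
                      float* out) {
  for (int i = 0; i < batchSize; ++i) {
    const float* src = candidates[index[i]];
    std::copy(src + i * width, src + (i + 1) * width, out + i * width);
  }
}
```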
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
deleted file mode 100644
index cc48fe100f12446f9522078119ae2ead039a82cc..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/NCELayer.cpp
+++ /dev/null
@@ -1,323 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include
-
-#include "Layer.h"
-#include "MultinomialSampler.h"
-#include "paddle/math/MathFunctions.h"
-
-namespace paddle {
-
-/**
- * Noise-contrastive estimation.
- * Implements the method in the following paper:
- * A fast and simple algorithm for training neural probabilistic language
- * models.
- *
- * The config file api is nce_layer.
- */
-class NCELayer : public Layer {
-  int numClasses_;
-  /// number of input layers besides labelLayer and weightLayer
-  int numInputs_;
-  LayerPtr labelLayer_;
-  /// weight layer, can be None
-  LayerPtr weightLayer_;
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-  std::unique_ptr<MultinomialSampler> sampler_;
-
-  std::uniform_int_distribution<int> rand_;
-
-  struct Sample {
-    int sampleId;
-    int labelId;
-    bool target;
-    real weight;
-  };
-  std::vector<Sample> samples_;
-  /// whether samples_ is prepared
-  bool prepared_;
-  Argument sampleOut_;
-
-  IVectorPtr labelIds_;
-
- public:
-  explicit NCELayer(const LayerConfig& config)
-      : Layer(config),
-        numClasses_(config.num_classes()),
-        rand_(0, config.num_classes() - 1),
-        prepared_(false) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    /* Initialize the basic parent class */
-    Layer::init(layerMap, parameterMap);
-
-    /* initialize the weightList */
-    size_t i;
-    for (i = 0; i < inputLayers_.size(); i++) {
-      if (!parameters_[i]) break;
-      size_t width = inputLayers_[i]->getSize();
-      // create a new weight
-      CHECK_EQ(parameters_[i]->getSize(), width * numClasses_);
-      Weight* w = new Weight(numClasses_, width, parameters_[i]);
-
-      // append the new weight to the list
-      weights_.emplace_back(w);
-    }
-
-    CHECK_EQ(1U, getSize());
-
-    numInputs_ = i;
-    CHECK_GE(numInputs_, 1)
-        << "Must have at least one input besides label and weight";
-    CHECK_LT(i, inputLayers_.size()) << "Missing label layer";
-    labelLayer_ = inputLayers_[i];
-    if (++i < inputLayers_.size()) {
-      weightLayer_ = inputLayers_[i];
-      ++i;
-    }
-    CHECK_EQ(i, inputLayers_.size());
-
-    /* initialize biases_ */
-    if (biasParameter_.get() != NULL) {
-      CHECK_EQ(biasParameter_->getSize(), (size_t)numClasses_);
-      biases_.reset(new Weight(1, numClasses_, biasParameter_));
-    }
-
-    if (config_.neg_sampling_dist_size()) {
-      CHECK_EQ(numClasses_, config_.neg_sampling_dist_size());
-      sampler_.reset(MultinomialSampler::create(
-          config_.neg_sampling_dist().data(), numClasses_));
-    }
-
-    return true;
-  }
-
-  void prepareSamples() {
-    CHECK(!useGpu_) << "GPU is not supported";
-
-    int batchSize = getInput(*labelLayer_).getBatchSize();
-    IVectorPtr label = getInput(*labelLayer_).ids;
-
-    CpuSparseMatrixPtr multiLabel = std::dynamic_pointer_cast<CpuSparseMatrix>(
-        getInput(*labelLayer_).value);
-
-    CHECK(label || multiLabel)
-        << "The label layer must have ids or NonValueSparseMatrix value";
-
-    auto& randEngine = ThreadLocalRandomEngine::get();
-
-    samples_.clear();
-    samples_.reserve(batchSize * (1 + config_.num_neg_samples()));
-
-    real* weight =
-        weightLayer_ ? getInputValue(*weightLayer_)->getData() : nullptr;
-
-    for (int i = 0; i < batchSize; ++i) {
-      real w = weight ? weight[i] : 1;
-      if (label) {
-        int* ids = label->getData();
-        samples_.push_back({i, ids[i], true, w});
-      } else {
-        const int* cols = multiLabel->getRowCols(i);
-        int n = multiLabel->getColNum(i);
-        for (int j = 0; j < n; ++j) {
-          samples_.push_back({i, cols[j], true, w});
-        }
-      }
-      for (int j = 0; j < config_.num_neg_samples(); ++j) {
-        int id = sampler_ ?
-            sampler_->gen(randEngine) : rand_(randEngine);
-        samples_.push_back({i, id, false, w});
-      }
-    }
-    prepared_ = true;
-  }
-
-  void prefetch() override {
-    prepareSamples();
-    IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_);
-    int* ids = labelIds_->getData();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      ids[i] = samples_[i].labelId;
-    }
-
-    for (int i = 0; i < numInputs_; ++i) {
-      auto sparseParam =
-          dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
-      if (sparseParam) {
-        sparseParam->addRows(labelIds_);
-      }
-    }
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-
-    CHECK(!useGpu_) << "GPU is not supported";
-
-    if (!prepared_) {
-      if (passType == PASS_GC) {
-        ThreadLocalRandomEngine::get().seed(ThreadLocalRand::getDefaultSeed());
-      }
-      prepareSamples();
-    }
-    prepared_ = false;
-
-    /* malloc memory for the output_ if necessary */
-    int batchSize = getInputValue(0)->getHeight();
-    int size = getSize();
-    resetOutput(batchSize, size);
-
-    Matrix::resizeOrCreate(sampleOut_.value,
-                           1,
-                           samples_.size(),
-                           /* trans= */ false,
-                           useGpu_);
-
-    forwardBias();
-
-    for (int l = 0; l < numInputs_; ++l) {
-      forwardOneInput(l);
-    }
-
-    auto status = activation_->forward(sampleOut_);
-    status.check();
-
-    forwardCost();
-  }
-
-  void backward(const UpdateCallback& callback) override {
-    Matrix::resizeOrCreate(sampleOut_.grad,
-                           1,
-                           samples_.size(),
-                           /* trans= */ false,
-                           useGpu_);
-
-    backwardCost();
-
-    auto status = activation_->backward(sampleOut_);
-    status.check();
-
-    if (biases_->getWGrad()) {
-      backwardBias(callback);
-    }
-
-    for (int l = 0; l < numInputs_; ++l) {
-      backwardOneInput(l, callback);
-    }
-  }
-
-  void forwardBias() {
-    if (!biases_) {
-      sampleOut_.value->zeroMem();
-    } else {
-      real* bias = biases_->getW()->getData();
-      real* sampleOut = sampleOut_.value->getData();
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        sampleOut[i] = bias[samples_[i].labelId];
-      }
-    }
-  }
-
-  void backwardBias(const UpdateCallback& callback) {
-    if (!biases_) return;
-    real* bias = biases_->getWGrad()->getData();
-    real* sampleOut = sampleOut_.grad->getData();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      bias[samples_[i].labelId] += sampleOut[i];
-    }
-    biases_->incUpdate(callback);
-  }
-
-  void forwardOneInput(int layerId) {
-    const MatrixPtr& inputMat = getInputValue(layerId);
-    const MatrixPtr& weightMat = weights_[layerId]->getW();
-
-    int dim = inputMat->getWidth();
-    real* sampleOut = sampleOut_.value->getData();
-
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      sampleOut[i] += dotProduct(dim,
-                                 inputMat->getRowBuf(samples_[i].sampleId),
-                                 weightMat->getRowBuf(samples_[i].labelId));
-    }
-  }
-
-  void backwardOneInput(int layerId, const UpdateCallback& callback) {
-    const MatrixPtr& inputMat = getInputValue(layerId);
-    const MatrixPtr& inputGradMat = getInputGrad(layerId);
-    const MatrixPtr& weightMat = weights_[layerId]->getW();
-    const MatrixPtr& weightGradMat = weights_[layerId]->getWGrad();
-
-    int dim = inputMat->getWidth();
-    real* sampleGrad = sampleOut_.grad->getData();
-
-    if (weightGradMat) {
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        axpy(dim,
-             sampleGrad[i],
-             inputMat->getRowBuf(samples_[i].sampleId),
-             weightGradMat->getRowBuf(samples_[i].labelId));
-      }
-      weights_[layerId]->incUpdate(callback);
-    }
-
-    if (inputGradMat) {
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        axpy(dim,
-             sampleGrad[i],
-             weightMat->getRowBuf(samples_[i].labelId),
-             inputGradMat->getRowBuf(samples_[i].sampleId));
-      }
-    }
-  }
-
-  void forwardCost() {
-    real* out = output_.value->getData();
-    real* sampleOut = sampleOut_.value->getData();
-    real b = 1. / numClasses_ * config_.num_neg_samples();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      real o = sampleOut[i];
-      if (sampler_) {
-        b = config_.num_neg_samples() *
-            config_.neg_sampling_dist(samples_[i].labelId);
-      }
-      real cost = samples_[i].target ? -log(o / (o + b)) : -log(b / (o + b));
-      out[samples_[i].sampleId] += samples_[i].weight * cost;
-    }
-  }
-
-  void backwardCost() {
-    real* sampleOut = sampleOut_.value->getData();
-    real* sampleGrad = sampleOut_.grad->getData();
-
-    real b = 1. / numClasses_ * config_.num_neg_samples();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      real o = sampleOut[i];
-      if (sampler_) {
-        b = config_.num_neg_samples() *
-            config_.neg_sampling_dist(samples_[i].labelId);
-      }
-      real w = samples_[i].weight;
-      sampleGrad[i] = samples_[i].target ? -w * b / (o * (o + b)) : w / (o + b);
-    }
-  }
-};
-
-REGISTER_LAYER(nce, NCELayer);
-
-} // namespace paddle
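For reference, the per-sample cost and gradient that `forwardCost()` and `backwardCost()` compute can be written out. With model score o, k = num_neg_samples, and noise mass b = k·q(label) (uniform noise gives b = k / numClasses), the code corresponds to

```latex
P(\text{real} \mid o) = \frac{o}{o + b}, \qquad
\text{cost} =
\begin{cases}
  -\log \dfrac{o}{o + b} & \text{positive sample} \\[4pt]
  -\log \dfrac{b}{o + b} & \text{noise sample}
\end{cases}
\qquad
\frac{\partial\, \text{cost}}{\partial o} =
\begin{cases}
  -\dfrac{b}{o\,(o + b)} & \text{positive} \\[4pt]
  \dfrac{1}{o + b} & \text{noise}
\end{cases}
```

which matches the `-w * b / (o * (o + b))` and `w / (o + b)` branches above, with w being the per-sample weight.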
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
deleted file mode 100644
index 3807584415f99a7110170748501589dac85eac52..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/NormLayer.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include "Layer.h"
-#include "NormLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of normalization
- *
- * @note Normalize the input within a local region
- */
-class NormLayer : public Layer {
- public:
-  explicit NormLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    Layer::init(layerMap, parameterMap);
-    return true;
-  }
-
-  /**
-   * @brief create norm layer by norm_type
-   */
-  static Layer* create(const LayerConfig& config);
-};
-
-/**
- * @brief response normalization within feature maps,
- * i.e. normalize each channel independently.
- * The original implementation was deleted during code refactoring;
- * it needs to be reimplemented in the future.
- */
-class ResponseNormLayer : public NormLayer {
- protected:
-  size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
-  real scale_, pow_;
-  MatrixPtr denoms_;
-
- public:
-  explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; }
-  void backward(const UpdateCallback& callback = nullptr) override {
-    LOG(FATAL) << "Not implemented";
-  }
-};
-
-/**
- * This layer applies normalization across the channels of each sample to a
- * conv layer's output, and scales the output by a group of trainable factors
- * whose dimension equals the number of channels.
- * - Input: One and only one input layer is accepted.
- * - Output: The normalized data of the input data.
- * Reference:
- *   Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *   Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-class CrossChannelNormLayer : public NormLayer {
- public:
-  explicit CrossChannelNormLayer(const LayerConfig& config)
-      : NormLayer(config) {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
-  MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
-  MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
-
- protected:
-  size_t channels_;
-  std::unique_ptr<Weight> scale_;
-  MatrixPtr scaleDiff_;
-  MatrixPtr normBuffer_;
-  MatrixPtr dataBuffer_;
-  MatrixPtr channelBuffer_;
-  MatrixPtr spatialBuffer_;
-  MatrixPtr sampleBuffer_;
-};
-
-} // namespace paddle
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
deleted file mode 100644
index 64803a1603599f2e393ec772a32d64f4d271fe71..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include "NormLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief response normalization across feature maps,
- * i.e. normalize over size_ neighboring channels
- */
-class CMRProjectionNormLayer : public ResponseNormLayer {
-  size_t imgSizeH_, imgSizeW_;
-  size_t outputH_, outputW_;
-
- public:
-  explicit CMRProjectionNormLayer(const LayerConfig& config)
-      : ResponseNormLayer(config) {}
-
-  ~CMRProjectionNormLayer() {}
-
-  size_t getSize();
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  TensorShape shape_;
-};
-} // namespace paddle
diff --git a/paddle/gserver/layers/Operator.h b/paddle/gserver/layers/Operator.h
deleted file mode 100644
index 42d525ef3e4534acea7512d5ecdbe8a0e1d110d9..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Operator.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ModelConfig.pb.h"
-#include "paddle/parameter/Parameter.h"
-
-#include "Layer.h"
-#include "paddle/parameter/Argument.h"
-
-namespace paddle {
-
-// Macro for registering an operator type.
-// Example: REGISTER_OPERATOR(dot_mul, DotMulOperator);
-#define REGISTER_OPERATOR(__type_name, __class_name) \
-  static InitFunction __reg_type_##__type_name([]() { \
-    Operator::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-/**
- * An Operator is like a Projection, but takes more than one Argument as input.
- * @note: an Operator can't have parameters.
- */
-class Operator {
- public:
-  static Operator* create(const OperatorConfig& config, bool useGpu);
-
-  Operator(const OperatorConfig& config, bool useGpu)
-      : config_(config), useGpu_(useGpu) {}
-
-  virtual ~Operator() {}
-
-  const OperatorConfig& getConfig() const { return config_; }
-
-  static ClassRegistrar<Operator, OperatorConfig, bool> registrar_;
-
-  /**
-   * Forward propagation. If backward() will be called, in and out must be kept
-   * valid until then.
-   * @param ins inputs of operator
-   * @param out output of operator
-   * @param passType PASS_TRAIN or PASS_TEST
-   */
-  void forward(std::vector<const Argument*> ins,
-               Argument* out,
-               PassType passType) {
-    ins_ = ins;
-    out_ = out;
-    passType_ = passType;
-    forward();
-  }
-
-  virtual void prefetch(const Argument* in) {}
-  virtual void forward() = 0;
-  virtual void backward() = 0;
-
-  /**
-   * See comment in Layer.h for the function with the same name.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Get layer state.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
- protected:
-  /// Config of operator
-  OperatorConfig config_;
-  bool useGpu_;
-
-  /// Store `ins` passed to forward()
-  std::vector<const Argument*> ins_;
-  /// Store `out` passed to forward()
-  Argument* out_;
-  /// Store `passType` passed to forward()
-  PassType passType_;
-};
-} // namespace paddle
diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp
deleted file mode 100644
index 11a910f3316114b309efe9007a156e842b3d6229..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for computing the outer product of two vectors.
- * @note used in NEURAL TURING MACHINE
- * Input1: vector (batchSize * dim1)
- * Input2: vector (batchSize * dim2)
- * Output: a matrix: (batchSize * (dim1*dim2))
- */
-
-class OuterProdLayer : public Layer {
- protected:
-  MatrixPtr tmpMtx0;
-  MatrixPtr tmpRow0;
-  MatrixPtr tmpRow1;
-
- public:
-  explicit OuterProdLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~OuterProdLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(out_prod, OuterProdLayer);
-
-bool OuterProdLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  size_t dim0 = inputLayers_[0]->getSize();
-  size_t dim1 = inputLayers_[1]->getSize();
-
-  CHECK_EQ(dim0 * dim1, getSize()) << "Dimension mismatch";
-
-  tmpRow0 = Matrix::create(
-      nullptr, /* height= */ 1, dim0, /* trans= */ false, useGpu_);
-  tmpRow1 = Matrix::create(
-      nullptr, /* height= */ 1, dim1, /* trans= */ false, useGpu_);
-  tmpMtx0 = Matrix::create(nullptr,
-                           /* height= */ dim0,
-                           dim1,
-                           /* trans= */ false,
-                           useGpu_);
-  return true;
-}
-
-void OuterProdLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dim0 = inV0->getWidth();
-  size_t dim1 = inV1->getWidth();
-
-  CHECK_EQ(dim0 * dim1, getSize());
-  CHECK_EQ(inV1->getHeight(), batchSize);
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dim0 * dim1);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("FwOutProdTimer", getName().c_str());
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpMtx0->setData(outV->getData() + i * dim0 * dim1);
-      tmpRow0->setData(inV0->getData() + i * dim0);
-      tmpRow1->setData(inV1->getData() + i * dim1);
-
-      tmpMtx0->mul(*tmpRow0->getTranspose(), *tmpRow1);
-    }
-  }
-}
-
-void OuterProdLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dim0 = inV0->getWidth();
-  size_t dim1 = inV1->getWidth();
-
-  {
-    REGISTER_TIMER_INFO("BwOutProdTimer", getName().c_str());
-
-    if (inG0) {
-      for (size_t i = 0; i < batchSize; i++) {
-        tmpMtx0->setData(outG->getData() + i * dim0 * dim1);
-        tmpRow0->setData(inG0->getData() + i * dim0);
-        tmpRow1->setData(inV1->getData() + i * dim1);
-
-        tmpRow0->mul(*tmpRow1, *tmpMtx0->getTranspose(), 1, 1);
-      }
-    }
-
-    if (inG1) {
-      for (size_t i = 0; i < batchSize; i++) {
-        tmpMtx0->setData(outG->getData() + i * dim0 * dim1);
-        tmpRow0->setData(inV0->getData() + i * dim0);
-        tmpRow1->setData(inG1->getData() + i * dim1);
-
-        tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 1);
-      }
-    }
-  }
-}
-
-} // namespace paddle
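Per batch row, the layer above computes a plain outer product and flattens it; the `mul` calls on `tmpMtx0` are exactly this loop in matrix form. A sketch with illustrative names:

```cpp
// out is the flattened dim0 x dim1 outer product of a (dim0) and b (dim1):
//   out[p * dim1 + q] = a[p] * b[q]
void outerProd(const float* a, const float* b,
               float* out, int dim0, int dim1) {
  for (int p = 0; p < dim0; ++p) {
    for (int q = 0; q < dim1; ++q) {
      out[p * dim1 + q] = a[p] * b[q];
    }
  }
}
```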
diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h
deleted file mode 100644
index 4553413fcdbecbc83e1f50e8ffbe874fdf05d828..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ParameterReluLayer.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * @brief ParameterReluLayer activates inputs with the learnable parameter
- * weight_.
- * forward:
- * \f[
- *    y = x > 0 ? x : w .* x
- * \f]
- * backward:
- * \f[
- *    dx = x > 0 ? dy : w .* dy \\
- *    dw = x > 0 ? 0 : dy .* x
- * \f]
- * Here, x is the input, w is the weight, y is the output.
- * dx, dw, and dy are the gradients.
- */
-
-class ParameterReluLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> weight_;
-
-  /**
-   * @brief partialSum_ makes a group of inputs share the same weight:
-   * - partialSum_ = 1:
-   *   element-wise activation: each element has its own weight_;
-   * - partialSum_ = number of elements in one channel:
-   *   channel-wise activation: elements in a channel share the same weight_;
-   * - partialSum_ = number of outputs:
-   *   all elements share the same weight_.
-   */
-  size_t partialSum_;
-
- public:
-  explicit ParameterReluLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ParameterReluLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-} // namespace paddle
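For the partialSum_ == 1 case above (one weight per element), the forward and backward formulas reduce to the following sketch. It is illustrative only; the deleted implementation also handles the grouped cases:

```cpp
// y = x > 0 ? x : w .* x, element-wise, one weight per element.
void preluForward(const float* x, const float* w, float* y, int n) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] > 0 ? x[i] : w[i] * x[i];
  }
}

// dx = x > 0 ? dy : w .* dy; dw accumulates dy .* x where x <= 0.
void preluBackward(const float* x, const float* w, const float* dy,
                   float* dx, float* dw, int n) {
  for (int i = 0; i < n; ++i) {
    dx[i] = x[i] > 0 ? dy[i] : w[i] * dy[i];
    dw[i] += x[i] > 0 ? 0.f : dy[i] * x[i];
  }
}
```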
diff --git a/paddle/gserver/layers/Pool3DLayer.h b/paddle/gserver/layers/Pool3DLayer.h
deleted file mode 100644
index 32605f8b7028cfb4909c885e83017a8cffa79575..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Pool3DLayer.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include "Layer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of pooling
- * Pools the input within regions
- */
-class Pool3DLayer : public Layer {
- public:
-  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
-  ~Pool3DLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-  size_t getSize();
-
- protected:
-  int channels_;
-  int sizeX_, sizeY_, sizeZ_;
-  int strideW_, strideH_, strideD_;
-  int paddingW_, paddingH_, paddingD_;
-  int imgSizeW_, imgSizeH_, imgSizeD_;
-  int outputW_, outputH_, outputD_;
-  std::string poolType_;
-  MatrixPtr maxPoolIdx_;
-};
-} // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
deleted file mode 100644
index 99f8f148e2eb00f7e431e7d8c5acbf9e27574017..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PoolLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include "Layer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of pooling
- * Pools the input within regions
- */
-class PoolLayer : public Layer {
- protected:
-  size_t channels_, sizeX_, stride_, outputX_, imgSize_;
-  int confPadding_;
-
-  size_t sizeY_;
-  size_t imgSizeY_;
-  size_t strideY_;
-  size_t outputY_;
-  int confPaddingY_;
-
-  std::string poolType_;
-
-  bool excludeMode_;
-
- public:
-  explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  /**
-   * @brief create pooling layer by pool_type
-   */
-  static Layer* create(const LayerConfig& config);
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-
-} // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h
deleted file mode 100644
index 8004cc1550337160b7f022c97a23ed8eb9d43ca4..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PoolProjection.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-#include "paddle/math/MathUtils.h"
-
-namespace paddle {
-
-class PoolProjection : public Projection {
- protected:
-  size_t imgSizeY_, imgSize_;
-  size_t outputY_, outputX_;
-  size_t strideY_, stride_;
-  size_t sizeY_, sizeX_;
-  int confPaddingY_, confPadding_;
-  size_t channels_;
-  std::string poolType_;
-  bool excludeMode_;
-
- public:
-  PoolProjection(const ProjectionConfig& config,
-                 ParameterPtr parameter,
-                 bool useGpu);
-
-  static PoolProjection* create(const ProjectionConfig& config,
-                                ParameterPtr parameter,
-                                bool useGpu);
-
-  const std::string& getPoolType() const { return poolType_; }
-
-  size_t getSize();
-};
-
-class MaxPoolProjection : public PoolProjection {
- public:
-  MaxPoolProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu)
-      : PoolProjection(config, parameter, useGpu) {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback = nullptr);
-};
-
-class AvgPoolProjection : public PoolProjection {
- public:
-  AvgPoolProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu)
-      : PoolProjection(config, parameter, useGpu) {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback = nullptr);
-};
-} // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h
deleted file mode 100644
index 9ad144cc2ad426caa522bf1061a750d47e64a755..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PoolProjectionLayer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include "PoolLayer.h"
-#include "PoolProjection.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief Basic parent layer of different kinds of pooling
- */
-class PoolProjectionLayer : public PoolLayer {
- protected:
-  size_t imgSizeH_, imgSizeW_;
-  size_t outputH_, outputW_;
-  std::unique_ptr<PoolProjection> poolProjection_;
-  ProjectionConfig projectionConfig_;
-
- public:
-  explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) {
-    PoolConfig* conf = projectionConfig_.mutable_pool_conf();
-    *conf = config_.inputs(0).pool_conf();
-    poolProjection_.reset(
-        PoolProjection::create(projectionConfig_, nullptr, useGpu_));
-  }
-
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-} // namespace paddle
diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp
deleted file mode 100644
index 7e8d60db8fe588026c6040099745c3aefd7237b5..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PowerLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * This layer applies a power function to a vector element-wise,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   y = x^w
- * \f]
- * where \f$x\f$ is an input vector, \f$w\f$ is a scalar weight,
- * and the output \f$y\f$ is a vector.
- *
- * The config file api is power_layer.
- */
-
-class PowerLayer : public Layer {
- protected:
-  MatrixPtr tmpMtx;
-
- public:
-  explicit PowerLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~PowerLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(power, PowerLayer);
-
-bool PowerLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  return true;
-}
-
-void PowerLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  CHECK_EQ(getSize(), dataDim);
-  CHECK_EQ(1U, inV0->getWidth());
-  CHECK_EQ(batchSize, inV0->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("FwPowerTimer", getName().c_str());
-    outV->rowPow(0, *inV1, *inV0);
-  }
-}
-
-void PowerLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  {
-    REGISTER_TIMER_INFO("BwPowerTimer", getName().c_str());
-    Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_);
-
-    if (inG0) {
-      tmpMtx->log2(*inV1);
-      tmpMtx->dotMul(*tmpMtx, *outV);
-
-      // inG0 += outG .* (log(inV1) * outV)
-      inG0->rowDotMul(0, *outG, *tmpMtx);
-    }
-
-    if (inG1) {
-      // tmp = (outV / inV1) * inV0
-      tmpMtx->dotDiv(*outV, *inV1);
-      tmpMtx->rowScale(0, *tmpMtx, *inV0);
-
-      inG1->addDotMul(*outG, *tmpMtx, 1, 1);
-    }
-  }
-}
-
-} // namespace paddle
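The backward pass above follows directly from differentiating y = x^w row-wise, where w is a per-sample scalar and x a row vector:

```latex
y_j = x_j^{\,w}, \qquad
\frac{\partial y_j}{\partial w} = y_j \ln x_j, \qquad
\frac{\partial y_j}{\partial x_j} = w\, x_j^{\,w-1} = \frac{w\, y_j}{x_j}
```

so the weight gradient is the row dot product of the output gradient with log(x) .* y (the `rowDotMul` call), and the input gradient scales the output gradient by (y / x) * w (the `dotDiv`/`rowScale` pair).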
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
deleted file mode 100644
index 39d2c2d737fa90737635efdb209610e156c8662f..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PriorBox.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/BaseMatrix.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief A layer for generating priorbox locations and variances.
- * - Input: Two and only two input layers are accepted. They must be
- *   a data output layer and a convolution output layer.
- * - Output: The priorbox locations and variances of the input data.
- * Reference:
- *   Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *   Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-
-class PriorBoxLayer : public Layer {
- public:  // NOLINT
-  explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override {}
-
- protected:  // NOLINT
-  int numPriors_;
-  std::vector<int> minSize_;
-  std::vector<int> maxSize_;
-  std::vector<real> aspectRatio_;
-  std::vector<real> variance_;
-  MatrixPtr buffer_;
-};
-
-REGISTER_LAYER(priorbox, PriorBoxLayer);
-
-bool PriorBoxLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  auto pbConf = config_.inputs(0).priorbox_conf();
-  std::vector<real> tmp;
-  aspectRatio_.push_back(1.);
-  std::copy(pbConf.min_size().begin(),
-            pbConf.min_size().end(),
-            std::back_inserter(minSize_));
-  std::copy(pbConf.max_size().begin(),
-            pbConf.max_size().end(),
-            std::back_inserter(maxSize_));
-  std::copy(pbConf.variance().begin(),
-            pbConf.variance().end(),
-            std::back_inserter(variance_));
-  std::copy(pbConf.aspect_ratio().begin(),
-            pbConf.aspect_ratio().end(),
-            std::back_inserter(tmp));
-
-  if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
-
-  // flip aspect ratios
-  for (unsigned index = 0; index < tmp.size(); index++) {
-    real ar = tmp[index];
-    if (fabs(ar - 1.) < 1e-6) continue;
-    aspectRatio_.push_back(ar);
-    aspectRatio_.push_back(1. / ar);
-  }
-
-  numPriors_ = aspectRatio_.size() * minSize_.size() + maxSize_.size();
-
-  return true;
-}
-
-void PriorBoxLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  auto input = getInput(0);
-  int layerWidth = input.getFrameWidth();
-  int layerHeight = input.getFrameHeight();
-
-  auto image = getInput(1);
-  int imageWidth = image.getFrameWidth();
-  int imageHeight = image.getFrameHeight();
-
-  real stepW = static_cast<real>(imageWidth) / layerWidth;
-  real stepH = static_cast<real>(imageHeight) / layerHeight;
-  int dim = layerHeight * layerWidth * numPriors_ * 4;
-  reserveOutput(1, dim * 2);
-  // use a cpu buffer to compute
-  Matrix::resizeOrCreate(buffer_, 1, dim * 2, false, false);
-  auto* tmpPtr = buffer_->getData();
-
-  int idx = 0;
-  for (int h = 0; h < layerHeight; ++h) {
-    for (int w = 0; w < layerWidth; ++w) {
-      real centerX = (w + 0.5) * stepW;
-      real centerY = (h + 0.5) * stepH;
-      for (size_t s = 0; s < minSize_.size(); s++) {
-        real minSize = minSize_[s];
-        real boxWidth = minSize;
-        real boxHeight = minSize;
-
-        // first prior: aspect_ratio == 1.0, compatible with the old logic
-        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-        // set the variance.
-        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-
-        if (maxSize_.size() > 0) {
-          // square prior with size sqrt(minSize * maxSize)
-          real maxSize = maxSize_[s];
-          boxWidth = boxHeight = sqrt(minSize * maxSize);
-          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-          // set the variance.
-          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-        }
-
-        // priors with different aspect ratios
-        for (size_t r = 0; r < aspectRatio_.size(); r++) {
-          real ar = aspectRatio_[r];
-          if (fabs(ar - 1.0) < 1e-6) {
-            continue;
-          }
-          boxWidth = minSize * sqrt(ar);
-          boxHeight = minSize / sqrt(ar);
-          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-          // set the variance.
-          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-        }
-      }
-    }
-  }
-
-  // clip the priors' coordinates such that they are within [0, 1]
-  for (int d = 0; d < dim * 2; ++d)
-    if ((d % 8) < 4)
-      tmpPtr[d] = std::min(std::max(tmpPtr[d], (real)0.), (real)1.);
-  MatrixPtr outV = getOutputValue();
-  outV->copyFrom(buffer_->data_, dim * 2);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h
deleted file mode 100644
index 88a41355cfce711e1e9522655058d0f1198e4e76..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Projection.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
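To make the prior-box arithmetic above concrete, here is a small standalone sketch (illustrative sizes only; not part of the deleted layer). It mirrors the corner layout for a single feature-map cell, assuming a 300x300 image, a 10x10 feature map, min_size 30, max_size 60, and one extra aspect ratio of 2; the real layer additionally writes the four variances after each box and clips coordinates to [0, 1] afterwards:

#include <cmath>
#include <cstdio>

int main() {
  const double imageW = 300., imageH = 300.;
  const double stepW = imageW / 10., stepH = imageH / 10.;       // 30 x 30 cells
  const double cx = (0 + 0.5) * stepW, cy = (0 + 0.5) * stepH;   // cell (0, 0)

  auto emit = [&](double bw, double bh, const char* tag) {
    std::printf("%s: [%.3f, %.3f, %.3f, %.3f]\n", tag,
                (cx - bw / 2.) / imageW, (cy - bh / 2.) / imageH,
                (cx + bw / 2.) / imageW, (cy + bh / 2.) / imageH);
  };
  emit(30., 30., "min_size square");                             // aspect ratio 1
  emit(std::sqrt(30. * 60.), std::sqrt(30. * 60.), "sqrt(min*max) square");
  emit(30. * std::sqrt(2.), 30. / std::sqrt(2.), "aspect ratio 2");
  return 0;
}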
-
-#pragma once
-
-#include "Layer.h"
-#include "ModelConfig.pb.h"
-#include "paddle/parameter/Parameter.h"
-
-namespace paddle {
-
-// Macro for registering a projection type
-// Example: REGISTER_PROJECTION(fc, FullMatrixProjection);
-#define REGISTER_PROJECTION(__type_name, __class_name) \
-  static InitFunction __reg_type_##__type_name([]() { \
-    Projection::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-#define REGISTER_PROJECTION_CREATE_FUNC(__type_name, createFunction) \
-  static InitFunction __reg_type_##__type_name([]() { \
-    Projection::registrar_.registerClass(#__type_name, createFunction); \
-  })
-
-/**
- * A projection takes one Argument as input, calculates the result and adds it
- * to the output Argument.
- */
-class Projection {
- public:
-  static Projection* create(const ProjectionConfig& config,
-                            ParameterPtr parameter,
-                            bool useGpu);
-
-  Projection(const ProjectionConfig& config,
-             ParameterPtr parameter,
-             bool useGpu)
-      : config_(config), parameter_(parameter), useGpu_(useGpu) {}
-
-  virtual ~Projection() {}
-
-  const std::string& getName() const { return config_.name(); }
-
-  /// Register a projection
-  static ClassRegistrar<Projection, ProjectionConfig, ParameterPtr, bool>
-      registrar_;
-
-  /**
-   * Forward propagation. If backward() will be called, in and out must be kept
-   * valid until then.
-   * @param in input of projection
-   * @param out output of projection
-   * @param passType PASS_TRAIN or PASS_TEST
-   */
-  void forward(const Argument* in, const Argument* out, PassType passType) {
-    in_ = in;
-    out_ = out;
-    passType_ = passType;
-    forward();
-  }
-
-  virtual void prefetch(const Argument* in) {}
-  virtual void forward() = 0;
-  virtual void backward(const UpdateCallback& callback) = 0;
-
-  /**
-   * See comment in Layer.h for the function with the same name.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Get layer state. A copy of internal state is returned.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
-  /**
-   * init forward_ and backward_ functions
-   */
-  virtual bool init() { return true; }
-
-  /**
-   * Get output size of projection.
-   */
-  size_t getOutputSize() const { return config_.output_size(); }
-
- protected:
-  /**
-   * Create layer function. Function is called in forward or backward.
-   * \param function, Layer::forward_ or Layer::backward_
-   * \param name, function name
-   * \param config, initialization configuration for the function
-   */
-  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
-                      const std::string& name,
-                      const FuncConfig& config) {
-    if (useGpu_) {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
-    } else {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
-    }
-    auto& func = function.back();
-    func->init(config);
-  }
-
- protected:
-  /// Config of projection
-  ProjectionConfig config_;
-  /// Parameter of projection
-  ParameterPtr parameter_;
-  bool useGpu_;
-
-  /// Store `in` passed to forward()
-  const Argument* in_;
-  /// Store `out` passed to forward()
-  const Argument* out_;
-  /// Store `passType` passed to forward()
-  PassType passType_;
-  /// Layer forward function
-  std::vector<std::shared_ptr<FunctionBase>> forward_;
-  /// Layer backward function
-  std::vector<std::shared_ptr<FunctionBase>> backward_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp
deleted file mode 100644
index 6694e8f2996fdd2c98da1507e5fb3b90b271c850..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include
-#include "paddle/gserver/layers/Layer.h"
-
-#include "paddle/gserver/gradientmachines/RecurrentGradientMachine.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A recurrent layer group is a group of layers that runs forward/backward one
- * frame after another, passing each frame forward/backward through all layers
- * in the layer group.
- * It's automatically added by config_parser if some layers are defined
- * between RecurrentLayerGroupBegin and RecurrentLayerGroupEnd.
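Stepping back to the Projection interface deleted above: a minimal, hypothetical subclass would slot into it roughly as follows. The class and its behavior are an illustrative sketch, not an existing projection in the codebase:

// Hypothetical sketch: a projection that simply adds its input to the output.
class IdentityProjectionSketch : public Projection {
 public:
  IdentityProjectionSketch(const ProjectionConfig& config,
                           ParameterPtr parameter,
                           bool useGpu)
      : Projection(config, parameter, useGpu) {}

  void forward() override {
    // in_ and out_ were stored by Projection::forward(in, out, passType).
    out_->value->add(*in_->value);
  }

  void backward(const UpdateCallback& callback) override {
    if (in_->grad) in_->grad->add(*out_->grad);
  }
};

// Registration would then mirror the macro's intended use:
// REGISTER_PROJECTION(identity_sketch, IdentityProjectionSketch);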
- */
-class RecurrentLayerGroup : public Layer {
- public:
-  explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {}
-
-  void initSubNetwork(NeuralNetwork* rootNetwork,
-                      const ModelConfig& config,
-                      const std::vector<ParameterType>& parameterTypes,
-                      bool useGpu) override;
-
-  void forward(PassType passType) override {
-    REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str());
-    const std::vector<Argument> inArgs;
-    std::vector<Argument> outArgs;
-    network_->forward(inArgs, &outArgs, passType);
-  }
-  void backward(const UpdateCallback& callback) override {
-    REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str());
-    network_->backward(nullptr);
-
-    for (auto& para : parameters_) {
-      para->incUpdate(callback);
-    }
-  }
-
-  /**
-   * @see Layer.accessSubNetwork
-   */
-  void accessSubNetwork(
-      const std::function<void(NeuralNetwork&)>& callback) override {
-    callback(*network_);
-  }
-
- private:
-  std::unique_ptr<RecurrentGradientMachine> network_;
-};
-
-REGISTER_LAYER(recurrent_layer_group, RecurrentLayerGroup);
-
-void RecurrentLayerGroup::initSubNetwork(
-    NeuralNetwork* rootNetwork,
-    const ModelConfig& config,
-    const std::vector<ParameterType>& parameterTypes,
-    bool useGpu) {
-  setNeedGradient(true);
-
-  network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork));
-  ParamInitCallback cb = [rootNetwork](int paramId, Parameter* para) {
-    para->enableSharedType(
-        PARAMETER_VALUE,
-        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_VALUE),
-        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_VALUE));
-    para->enableSharedType(
-        PARAMETER_GRADIENT,
-        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_GRADIENT),
-        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_GRADIENT));
-  };
-  network_->init(config, cb, parameterTypes, useGpu);
-
-  for (auto paramId : network_->getParameterIds()) {
-    ParameterPtr parameter = rootNetwork->getParameters()[paramId];
-    parameter->incShared();
-    CHECK_EQ(parameter->getDeviceId(), getDeviceId());
-    parameters_.push_back(parameter);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp
deleted file mode 100644
index d4ae9945934a40719d253d4b53915530423448af..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ResizeLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/BaseMatrix.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief A layer for resizing a minibatch matrix h*w to h'*w'
- * @note
- *   origin matrix: (height * width)
- *   resize matrix: (height * width / size) * size
- */
-class ResizeLayer : public Layer {
- public:
-  explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-};
-
-REGISTER_LAYER(resize, ResizeLayer);
-
-bool ResizeLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void ResizeLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  const Argument& input = getInput(0);
-  size_t height = input.value->getHeight();
-  size_t width = input.value->getWidth();
-  CHECK_EQ((height * width) % getSize(), 0UL);
-
-  reserveOutput(height * width / getSize(), getSize());
-  MatrixPtr tmp =
-      Matrix::create(output_.value->getData(), height, width, false, useGpu_);
-  tmp->assign(*input.value);
-}
-
-void ResizeLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
-  size_t height = input.value->getHeight();
-  size_t width = input.value->getWidth();
-
-  if (!input.grad) {
-    return;
-  }
-
-  MatrixPtr tmp = Matrix::create(input.grad->getData(),
-                                 height * width / getSize(),
-                                 getSize(),
-                                 false,
-                                 useGpu_);
-  tmp->add(*output_.grad);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h
deleted file mode 100644
index 7ecbff20167dd95f782f2d61dc34697ab3273934..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/RotateLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * A layer for rotating a multi-channel feature map (M x N x C) in the spatial
- * domain.
- * The rotation is 90 degrees clockwise for each channel.
- * \f[
- *   y(j,i,:) = x(M-i-1,j,:)
- * \f]
- * where \f$x\f$ is the (M x N x C) input, and \f$y\f$ is the (N x M x C)
- * output.
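A quick sanity check of the index mapping above, as a standalone sketch (not part of the header), rotating one 2 x 3 channel 90 degrees clockwise:

#include <cstdio>

int main() {
  const int M = 2, N = 3;
  int x[M][N] = {{1, 2, 3}, {4, 5, 6}};
  int y[N][M];
  // y(j, i) = x(M - i - 1, j), per the formula in the comment above.
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < M; ++i) y[j][i] = x[M - i - 1][j];
  // x = 1 2 3      y = 4 1
  //     4 5 6          5 2
  //                    6 3
  for (int j = 0; j < N; ++j) std::printf("%d %d\n", y[j][0], y[j][1]);
  return 0;
}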
- * - * The config file api is rotate_layer - * - */ - -class RotateLayer : public Layer { - public: - explicit RotateLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - - void forward(PassType passType); - void backward(const UpdateCallback& callback = nullptr); - - private: - int batchSize_; - int size_; - int height_; - int width_; - int channels_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp deleted file mode 100644 index 15e07daebee194a789da52d37a192e031348300c..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ScalingLayer.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for each row of a matrix, multiplying with a element of a vector, - * which is used in NEURAL TURING MACHINE. - * \f[ - * y.row[i] = w[i] * x.row[i] - * \f] - * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is - * (batchSize x 1) weight vector, and \f$y\f$ is (batchSize x dataDim) output. - * - * The config file api is scaling_layer. 
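The backward pass below implements the product-rule gradients one would expect from this definition: with output rows \(y_i = w_i x_i\) and output gradient rows \(g_i\),

\[
\frac{\partial L}{\partial w_i} = g_i \cdot x_i \quad\text{(a row-wise dot product)},
\qquad
\frac{\partial L}{\partial x_i} = w_i\, g_i ,
\]

which correspond to the rowDotMul and addRowScale calls in ScalingLayer::backward.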
- */ - -class ScalingLayer : public Layer { - public: - explicit ScalingLayer(const LayerConfig& config) : Layer(config) {} - - ~ScalingLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(scaling, ScalingLayer); - -bool ScalingLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2U); - - return true; -} - -void ScalingLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr weightV = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV1->getHeight(); - size_t dataDim = inV1->getWidth(); - - CHECK_EQ(dataDim, getSize()); - CHECK_EQ(weightV->getWidth(), 1U); - CHECK_EQ(weightV->getHeight(), batchSize); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - resetOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwScalingTimer", getName().c_str()); - // outV += inV1 * weight - outV->addRowScale(0, *inV1, *weightV); - } -} - -void ScalingLayer::backward(const UpdateCallback& callback) { - MatrixPtr weightV = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - MatrixPtr outG = getOutputGrad(); - - { - REGISTER_TIMER_INFO("BwScalingTimer", getName().c_str()); - - if (inG0) { - // inG0 += outG .* inV1 - inG0->rowDotMul(0, *outG, *inV1); - } - - if (inG1) { - // inG1 += outG * weight; - inG1->addRowScale(0, *outG, *weightV); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp deleted file mode 100644 index 43c98993f3f6f74c034c59176378c3ea97a9c19b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp +++ /dev/null @@ -1,336 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-
-#include "SelectiveFullyConnectedLayer.h"
-#include
-#include
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(selective_fc, SelectiveFullyConnectedLayer);
-
-bool SelectiveFullyConnectedLayer::init(const LayerMap& layerMap,
-                                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  inputNum_ = inputLayers_.size();
-  if (config_.has_selected_colums()) {
-    inputNum_ -= 1;
-  }
-  for (size_t i = 0; i < inputNum_; i++) {
-    size_t height = inputLayers_[i]->getSize();
-    size_t width = getSize();
-    // NOTE the weight is transposed
-    weights_.emplace_back(new Weight(width, height, parameters_[i]));
-  }
-
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  fullOutput_ = false;
-
-  return true;
-}
-
-void SelectiveFullyConnectedLayer::prefetch() {}
-
-void SelectiveFullyConnectedLayer::reserveOutput(size_t height,
-                                                 size_t width,
-                                                 size_t nnz) {
-  bool flag = (passType_ == PASS_TEST &&
-               config_.selective_fc_pass_generation() && !fullOutput_);
-  SetDevice device(output_.deviceId);
-  if (flag) {
-    // output_.value is a sparse matrix
-    if (dynamic_cast<CpuMatrix*>(output_.value.get()) ||
-        dynamic_cast<GpuMatrix*>(output_.value.get())) {
-      output_.value = nullptr;
-    }
-    Matrix::resizeOrCreateSparseMatrix(output_.value,
-                                       height,
-                                       width,
-                                       nnz,
-                                       FLOAT_VALUE,
-                                       SPARSE_CSR,
-                                       /*trans=*/false,
-                                       /*useGpu=*/useGpu_);
-    output_.value->copyFrom(*selCols_);
-    interOutput_ = output_.value;
-  } else {
-    if (fullOutput_) {
-      // output_.value is a dense matrix
-      if (dynamic_cast<CpuSparseMatrix*>(output_.value.get()) ||
-          dynamic_cast<GpuSparseMatrix*>(output_.value.get())) {
-        output_.value = nullptr;
-      }
-      Matrix::resizeOrCreate(output_.value,
-                             height,
-                             width,
-                             /*trans=*/false,
-                             /*useGpu=*/useGpu_);
-      interOutput_ = output_.value;
-    } else {
-      // output_.value is a dense matrix, but width = nnz / height
-      CHECK_EQ(nnz % height, 0U);
-      CHECK(nnz / height);
-      Matrix::resizeOrCreate(output_.value,
-                             height,
-                             nnz / height,
-                             /*trans=*/false,
-                             /*useGpu=*/useGpu_);
-      interOutput_ = Matrix::createSparseMatrix(output_.value->getData(),
-                                                selCols_->getRows(),
-                                                selCols_->getCols(),
-                                                height,
-                                                width,
-                                                nnz,
-                                                FLOAT_VALUE,
-                                                SPARSE_CSR,
-                                                /*trans=*/false,
-                                                /*useGpu=*/useGpu_);
-    }
-  }
-  interOutput_->zeroMem();
-
-  if (passType_ != PASS_TEST && needGradient()) {
-    CHECK_EQ(nnz % height, 0U) << "during training, each sample must have the "
-                                  "same number of selected columns.";
-    CHECK(nnz / height)
-        << "during training, "
-           "each sample must have at least one column selected.";
-    Matrix::resizeOrCreate(output_.grad,
-                           height,
-                           nnz / height,
-                           /*trans=*/false,
-                           /*useGpu=*/useGpu_);
-    output_.grad->zeroMem();
-  }
-}
-
-void SelectiveFullyConnectedLayer::forward(PassType passType) {
-  REGISTER_TIMER("selective_fc.forward");
-  Layer::forward(passType);
-
-  getSelectiveCols();
-  size_t height = getInput(0).getBatchSize();
-  size_t width = getSize();
-  size_t nnz = height * width;
-  if (!fullOutput_) {
-    CHECK(selCols_);
-    CHECK(height == selCols_->getHeight());
-    CHECK(width == selCols_->getWidth());
-    nnz = selCols_->getElementCnt();
-  }
-
-  // Layer::ResetOutput(), here we set outV/outG as SparseMatrix manually
-  // this outV should be used as input of MaxIdLayer and softmax activation
-  reserveOutput(height, width, nnz);
-
-  bool flag = true;
-  for (size_t i = 0; i < inputNum_; i++) {
-    MatrixPtr input = getInputValue(i);
-    MatrixPtr weight = weights_[i]->getW();
-    size_t hsize = input->getHeight();
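// Note on the branch below: when the selected-column count (nnz) is small
// relative to the full hsize x wsize product, scaled by
// selective_fc_full_mul_ratio, multiplying the input with only the selected
// weight rows is cheaper; otherwise a dense multiplication into mmat_
// followed by a gather into the sparse output (add3) is used instead.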
-    size_t wsize = weight->getHeight();
-    real scaleT = i == 0 ? real(0) : real(1);
-
-    flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() &&
-           !fullOutput_;
-    if (flag) {
-      // if the indices are highly sparse,
-      // manually compute the multiplication of
-      // the input vector and the selected rows.
-      REGISTER_TIMER("selective.plain");
-      interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
-    } else {
-      // if the indices are not sparse enough,
-      // use the full mul instead
-      REGISTER_TIMER("selective.mul");
-      if (fullOutput_) {
-        interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
-      } else {
-        Matrix::resizeOrCreate(mmat_,
-                               hsize,
-                               wsize,
-                               /*trans=*/false,
-                               /*useGpu=*/useGpu_);
-        mmat_->mul(*input, *weight->getTranspose());
-        interOutput_->add3(mmat_);
-      }
-    }
-  }
-
-  if (biases_) {
-    interOutput_->addBias(*(biases_->getW()), 1);
-  }
-
-  flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() &&
-          !fullOutput_);
-  if (flag) {
-    // during generation, the output of this layer is a sparse csr matrix,
-    // which is probably the input of the maxid layer
-    // if the model is trained with multi-class-cross-entropy-with-selfnorm,
-    // the activation of this layer should be exponential, not softmax.
-
-    Argument arg;
-    arg.value = Matrix::create(interOutput_->getData(),
-                               1,
-                               nnz,
-                               /*trans=*/false,
-                               /*useGpu=*/useGpu_);
-    //! TODO(yuyang18): Why can't we invoke forwardActivation here?
-    activation_->forward(arg).check();
-  } else /* train and test in train, not generating */ {
-    // during training, this layer's output value is *Matrix*, which is the
-    // input of e.g. multi-class-cross-entropy
-
-    // while training, every sample has an equal number of selected
-    // columns to be activated.
-    // note that the indices of multi-class-cross-entropy need to be remapped
-    // to this index.
-    // e.g. sample = [1,3,5] and 3 is gold, then label is 1
-
-    forwardActivation();
-  }
-}
-
-void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) {
-  backwardActivation();
-  MatrixPtr oGrad = getOutputGrad();
-  if (!fullOutput_) {
-    interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(),
-                                               interOutput_->getRows(),
-                                               interOutput_->getCols(),
-                                               interOutput_->getHeight(),
-                                               interOutput_->getWidth(),
-                                               interOutput_->getElementCnt(),
-                                               FLOAT_VALUE,
-                                               SPARSE_CSR,
-                                               /*trans=*/false,
-                                               /*useGpu=*/useGpu_);
-  } else {
-    interOutGrad_ = Matrix::create(oGrad->getData(),
-                                   oGrad->getHeight(),
-                                   oGrad->getWidth(),
-                                   /*trans=*/false,
-                                   /*useGpu=*/useGpu_);
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*interOutGrad_, 1);
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  // backward is different from FullyConnectedLayer
-  // because the weight is transposed
-  for (size_t i = 0; i < inputNum_; i++) {
-    AsyncGpuBlock block;
-    MatrixPtr preGrad = getInputGrad(i);
-    if (preGrad) {
-      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-      preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1);
-    }
-
-    MatrixPtr wGrad = weights_[i]->getWGrad();
-    if (wGrad) {
-      REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-      MatrixPtr input = getInputValue(i);
-      wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1);
-    }
-
-    {
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-void paddle::SelectiveFullyConnectedLayer::fillSelectiveData(
-    const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates) {
-  if (candidates == nullptr) {
-    fillFullySelectiveData();
-    return;
-  }
-
-  size_t sampleNum = candidates->size();
-  size_t outputWidth = getSize();
-  size_t nnz =
-      std::accumulate(candidates->begin(),
-                      candidates->end(),
-                      0UL,
-                      [](size_t a, const std::pair<int*, size_t>& arr) {
-                        return a + arr.second;
-                      });
-
-  Matrix::resizeOrCreateSparseMatrix(this->cpuSelCols_,
-                                     sampleNum,
-                                     outputWidth,
-                                     nnz,
-                                     NO_VALUE,
-                                     SPARSE_CSR,
-                                     false,
-                                     false);
-  CHECK(this->cpuSelCols_ != nullptr);
-  CpuSparseMatrixPtr selCols =
-      std::dynamic_pointer_cast<CpuSparseMatrix>(cpuSelCols_);
-  int* rowOffsets = selCols->getRows();
-  int* colIndices = selCols->getCols();
-
-  rowOffsets[0] = 0;
-  int idx = 0;
-  for (size_t i = 0; i < sampleNum; ++i) {
-    if ((*candidates)[i].second > 0) {
-      rowOffsets[i + 1] = rowOffsets[i] + (*candidates)[i].second;
-      for (size_t j = 0; j < (*candidates)[i].second; ++j) {
-        colIndices[idx] = (*candidates)[i].first[j];
-        idx++;
-      }
-    } else {
-      rowOffsets[i + 1] = rowOffsets[i];
-    }
-  }
-
-  CHECK_EQ(static_cast<size_t>(rowOffsets[sampleNum]), nnz);
-  if (!useGpu_) {
-    this->selCols_ = this->cpuSelCols_;
-  } else {
-    Matrix::resizeOrCreateSparseMatrix(this->selCols_,
-                                       sampleNum,
-                                       outputWidth,
-                                       nnz,
-                                       NO_VALUE,
-                                       SPARSE_CSR,
-                                       false,
-                                       true);
-    this->selCols_->copyFrom(*cpuSelCols_, HPPL_STREAM_1);
-    hl_stream_synchronize(HPPL_STREAM_1);
-  }
-
-  fullOutput_ = false;
-}
-
-void paddle::SelectiveFullyConnectedLayer::getSelectiveCols() {
-  if (config_.has_selected_colums()) {
-    this->selCols_ = inputLayers_[inputNum_]->getOutputValue();
-    fullOutput_ = false;
-  } else if (!config_.selective_fc_pass_generation() || selCols_ == nullptr) {
-    this->fillFullySelectiveData();
-  }  // else selCols_ is initialized by fillSelectiveData
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
deleted file mode 100644 index 4b32ce8b162c2a8b1a6c34adc0885a7701f5f91e..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/** - * @brief The SelectiveFullyConnectedLayer class - * - * SelectiveFullyConnectedLayer differs from FullyConnectedLayer by that it - * requires an additional input to indicate several selected columns, and only - * compute the multiplications between the input matrices and the selected - * columns of the parameter matrices of this layer. If the selected columns is - * not specified, SelectiveFullyConnected layer acts exactly like - * FullyConnectedLayer. - * - * The config file api is selective_fc_layer. - */ -class SelectiveFullyConnectedLayer : public Layer { - protected: - WeightList weights_; - std::unique_ptr biases_; - - private: - /** - * Get selected columns each forward. - */ - void getSelectiveCols(); - - MatrixPtr mmat_; - /// cpuSelCols_ is a CpuSparseMatrix, used to save selected columns. - MatrixPtr cpuSelCols_; - /// CpuSparseMatrix or GpuSparseMatrix. In CPU mode, selCols_ points - /// to cpuSelCols_. - MatrixPtr selCols_; - size_t inputNum_; - - /// interOutput_ shared same memory with output_.value. - MatrixPtr interOutput_; - - /// if fullOutput_ is false, interOutGrad_ sparse matrix - MatrixPtr interOutGrad_; - - /// if true, means output_.value is the same as Fc Layer - bool fullOutput_; - - public: - explicit SelectiveFullyConnectedLayer(const LayerConfig& config) - : Layer(config), selCols_(nullptr) {} - - ~SelectiveFullyConnectedLayer() {} - void prefetch() override; - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - Weight& getWeight(int idx) { return *weights_[idx]; } - - /** - * @brief Resize the output matrix size. - * And reset value to zero - */ - void reserveOutput(size_t height, size_t width, size_t nnz); - - /** - * @brief Fill candidates to select several activations as output. - * @param candidates specifies several selected columns of the parameter - * matrices of this layer. - * Multiplications only between the input matrices and the selected columns - * are computed. - * If the candidates is a nullptr, selective fc layer acts exactly like the - * fully connected layer. 
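The shape of the candidates argument is easiest to see in a short sketch. This is a hypothetical construction: it assumes the std::pair<int*, size_t> element type used in fillSelectiveData above, and that the id arrays outlive the call:

// Two samples: sample 0 selects columns {2, 7, 9}, sample 1 selects {0, 3}.
std::vector<int> ids0 = {2, 7, 9};
std::vector<int> ids1 = {0, 3};
auto candidates = std::make_shared<std::vector<std::pair<int*, size_t>>>();
candidates->push_back({ids0.data(), ids0.size()});
candidates->push_back({ids1.data(), ids1.size()});
// layer->fillSelectiveData(candidates);  // then run forward() as usual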
- * @note CURRENTLY, THIS METHOD IS ONLY USED FOR BEAM SEARCH - */ - void fillSelectiveData( - const std::shared_ptr>>& candidates); - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - private: - /** - * @brief Make SelectiveFC act as FullyConnectedLayer - */ - void fillFullySelectiveData() { fullOutput_ = true; } -}; -} // namespace paddle diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp deleted file mode 100644 index c84c3ce4f080cc19f4937f04585accb5b2b347f9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequenceConcatLayer.cpp +++ /dev/null @@ -1,189 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for concatenating the first sequence with the second sequence - * Input: two sequences each containing the same number of instances - * seq1 = [a1, a2, ..., an] - * seq2 = [b1, b2, ..., bn] - * Output: a concatenated sequence of the two input sequences - * out = [a1, b1, a2, b2, ..., an, bn] - */ - -class SequenceConcatLayer : public Layer { - protected: - std::unique_ptr biases_; - - public: - explicit SequenceConcatLayer(const LayerConfig& config) : Layer(config) {} - - ~SequenceConcatLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(seqconcat, SequenceConcatLayer); - -bool SequenceConcatLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - // sequene concatenation layer should have exactly 2 inputs - CHECK_EQ(2U, inputLayers_.size()); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - setNeedSequenceInfo(false); - return true; -} - -void SequenceConcatLayer::forward(PassType passType) { - Layer::forward(passType); - - size_t dim = getSize(); - - const Argument& input1 = getInput(0); - size_t numSequences1 = input1.getNumSequences(); - auto startPositions1 = input1.sequenceStartPositions->getVector(false); - - const Argument& input2 = getInput(1); - size_t numSequences2 = input2.getNumSequences(); - auto startPositions2 = input2.sequenceStartPositions->getVector(false); - - CHECK_EQ(dim, input1.value->getWidth()); - CHECK_EQ(startPositions1->getData()[numSequences1], input1.getBatchSize()); - CHECK_EQ(numSequences1, startPositions1->getSize() - 1); - - CHECK_EQ(dim, input2.value->getWidth()); - CHECK_EQ(startPositions2->getData()[numSequences2], input2.getBatchSize()); - CHECK_EQ(numSequences2, startPositions2->getSize() - 1); - - CHECK_EQ(numSequences1, 
numSequences2); - - MatrixPtr inputValue1 = getInputValue(0); - MatrixPtr inputValue2 = getInputValue(1); - - // reset output - reserveOutput(inputValue1->getHeight() + inputValue2->getHeight(), dim); - - MatrixPtr outputValue = getOutputValue(); - - const int* starts1 = startPositions1->getData(); - const int* starts2 = startPositions2->getData(); - - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceConcatLayerForward", getName().c_str()); - - size_t offset = 0; - size_t leftNumIns = 0; - size_t rightNumIns = 0; - for (size_t seqId = 0; seqId < numSequences1; ++seqId) { - leftNumIns = starts1[seqId + 1] - starts1[seqId]; - outputValue->subMatrix(offset, leftNumIns) - ->assign(*(inputValue1->subMatrix(starts1[seqId], leftNumIns))); - offset += leftNumIns; - - rightNumIns = starts2[seqId + 1] - starts2[seqId]; - outputValue->subMatrix(offset, rightNumIns) - ->assign(*(inputValue2->subMatrix(starts2[seqId], rightNumIns))); - offset += rightNumIns; - } - - // modify the sequenceStartPositions - ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, numSequences1 + 1, false); - - int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); - - for (size_t seqId = 0; seqId < numSequences1 + 1; ++seqId) { - tgtBuf[seqId] = starts1[seqId] + starts2[seqId]; - } - } - - if (biases_.get() != NULL) { - MatrixPtr outV = getOutputValue(); - outV->addBias(*(biases_->getW()), 1); - } - - /* activation */ - forwardActivation(); -} - -void SequenceConcatLayer::backward(const UpdateCallback& callback) { - /* activation */ - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } - - MatrixPtr inputGrad1 = getInputGrad(0); - MatrixPtr inputGrad2 = getInputGrad(1); - MatrixPtr outputGrad = getOutputGrad(); - auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false); - auto startPositions2 = getInput(1).sequenceStartPositions->getVector(false); - - size_t numSequences1 = startPositions1->getSize() - 1; - size_t numSequences2 = startPositions2->getSize() - 1; - - CHECK_EQ(numSequences1, numSequences2); - - const int* starts1 = startPositions1->getData(); - const int* starts2 = startPositions2->getData(); - - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceConcatLayerBackward", getName().c_str()); - - size_t offset = 0; - size_t leftNumIns = 0; - size_t rightNumIns = 0; - for (size_t seqId = 0; seqId < numSequences1; ++seqId) { - leftNumIns = starts1[seqId + 1] - starts1[seqId]; - if (inputGrad1) { - inputGrad1->subMatrix(starts1[seqId], leftNumIns) - ->add(*(outputGrad->subMatrix(offset, leftNumIns))); - } - offset += leftNumIns; - - rightNumIns = starts2[seqId + 1] - starts2[seqId]; - if (inputGrad2) { - inputGrad2->subMatrix(starts2[seqId], rightNumIns) - ->add(*(outputGrad->subMatrix(offset, rightNumIns))); - } - offset += rightNumIns; - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp deleted file mode 100644 index 28d0a9296d4accd4152e886ccae12a776fdb8f7f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
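The start-position arithmetic in SequenceConcatLayer::forward above, tgtBuf[seqId] = starts1[seqId] + starts2[seqId], is worth a tiny worked example (a standalone sketch, not part of the layer):

#include <cstdio>

// seq1 holds sequences {a1 a2} and {a3}; seq2 holds {b1} and {b2 b3}.
int main() {
  int starts1[] = {0, 2, 3};  // seq1 start positions
  int starts2[] = {0, 1, 3};  // seq2 start positions
  int out[3];
  for (int i = 0; i < 3; ++i) out[i] = starts1[i] + starts2[i];
  // out == {0, 3, 6}: the output row blocks are [a1 a2 b1] and [a3 b2 b3],
  // so each output start is the sum of the two input starts.
  for (int i = 0; i < 3; ++i) std::printf("%d ", out[i]);
  return 0;
}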
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/Logging.h" - -#include "SequencePoolLayer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for extracting the last instance of the input sequence. - * Input: a sequence - * If SequenceLevel = kNonseq: - * Output: a sequence containing only the last instance of the input sequence - * If stride_ > 0: - * Output: a shorten sequence. Stride is the step size by which we slide a - * window upon the input sequence, and getting last instance - * operation is then applied to each interval independently. - * If SequenceLevel = kSeq: - * Check input sequence must has sub-sequence - * Output: a sequence containing only the last instance of each sub-sequence - * of the input sequence - * - * The config file api is last_seq and first_seq. - */ - -class SequenceLastInstanceLayer : public SequencePoolLayer { - protected: - MatrixPtr tmpSrc_; - MatrixPtr tmpDest_; - std::vector instanceIds_; - - public: - explicit SequenceLastInstanceLayer(const LayerConfig& config) - : SequencePoolLayer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer); - -bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - SequencePoolLayer::init(layerMap, parameterMap); - reversed_ = config_.select_first(); - - tmpSrc_ = - Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - tmpDest_ = - Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - - return true; -} - -void SequenceLastInstanceLayer::forward(PassType passType) { - SequencePoolLayer::forward(passType); - - auto starts = startPositions_->getData(false); - MatrixPtr inputValue = getInputValue(0); - MatrixPtr outputValue = getOutputValue(); - - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str()); - - instanceIds_.clear(); - for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { - int insId = reversed_ ? 
starts[seqId] : starts[seqId + 1] - 1; - instanceIds_.push_back(insId); - - outputValue->subMatrix(seqId, 1, tmpDest_) - ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_))); - } - } - - if (biases_.get() != NULL) { - outputValue->addBias(*(biases_->getW()), 1); - } - - /* activation, should set to 'linear' in most cases */ - forwardActivation(); -} - -void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) { - SequencePoolLayer::backward(callback); - - MatrixPtr inputGrad = getInputGrad(0); - MatrixPtr outputGrad = getOutputGrad(); - - if (inputGrad) { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str()); - - for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { - inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_) - ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_))); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h deleted file mode 100644 index 01183060afd58376bb718dda64d8106cce4899f9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequencePoolLayer.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { -/** - * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer. - * - * Input: one or more sequences. Each sequence contains some instances. - * If SequenceLevel = kNonSeq: - * Output: output size is the number of input sequences (NOT input instances) - * output[i] = seqlastin/average/max_{for each instance in this - * sequence}{input[i]} - * If stride_ > 0: - * Check input sequence must not have sub-sequence - * Output: a shorten sequence. Stride is the step size by which we slide - * a window upon the input sequence, and the pooling operation - * is then applied to each interval independently. - * If SequenceLevel = kSeq: - * Check input sequence must has sub-sequence - * Output: output size is the number of input sub-sequences - * output[i] = seqlastin/average/max_{for each instance in this - * sub-sequence}{input[i]} - * - * The config file api is pooling_layer. - */ - -class SequencePoolLayer : public Layer { - protected: - int type_; - std::unique_ptr biases_; - enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; - size_t newBatchSize_; - ICpuGpuVectorPtr startPositions_; - int stride_; - // Whether the input sequence is reversed or not. 
- bool reversed_ = false; - - public: - explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp deleted file mode 100644 index 319310af8c4ac3bdefd814ad05b7fde6070f2340..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequenceReshapeLayer.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for reshaping the sequence. Assume the input sequence has - * T instances, the dimension of each instance is M, and the input - * reshape_dim is N, then the output sequence has T*M/N instances, - * the dimension of each instance is N. - * - * Note that T*M/N must be an integer. - */ - -class SequenceReshapeLayer : public Layer { - protected: - std::unique_ptr biases_; - - MatrixPtr reshapedOutputGrad; - - public: - explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(seqreshape, SequenceReshapeLayer); - -bool SequenceReshapeLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(1U, inputLayers_.size()); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - setNeedSequenceInfo(false); - return true; -} - -void SequenceReshapeLayer::forward(PassType passType) { - Layer::forward(passType); - - const Argument& input = getInput(0); - - size_t inDim = input.value->getWidth(); - size_t outDim = getSize(); - - size_t numSequences = input.getNumSequences(); - - // by default, we assume each instance as a sequence - IVectorPtr seqStarts; - IVector::resizeOrCreate(seqStarts, input.getBatchSize() + 1, false); - int* startsData = seqStarts->getData(); - for (int i = 0; i < input.getBatchSize() + 1; i++) { - startsData[i] = i; - } - const int* starts = startsData; - - // if there is sequence, then use start positions - if (input.sequenceStartPositions) { - auto startPositions = input.sequenceStartPositions->getVector(false); - starts = startPositions->getData(); - CHECK_EQ(starts[numSequences], input.getBatchSize()); - CHECK_EQ(numSequences, startPositions->getSize() - 1); - } - - for (size_t seqID = 0; seqID < numSequences; seqID++) { - size_t inNumIns = starts[seqID + 1] 
- starts[seqID]; - size_t outNumIns = inNumIns * inDim / outDim; - CHECK_EQ(outNumIns * outDim, inNumIns * inDim); - } - - MatrixPtr inputValue = getInputValue(0); - - // reset output - reserveOutput(inputValue->getHeight() * inDim / outDim, outDim); - MatrixPtr outputValue = getOutputValue(); - - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceReshapeLayerForward", getName().c_str()); - - outputValue->copyFrom(*inputValue); - - // modify the sequenceStartPositions - ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, numSequences + 1, false); - - int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); - - for (size_t seqId = 0; seqId < numSequences + 1; ++seqId) { - tgtBuf[seqId] = starts[seqId] * inDim / outDim; - } - } - - if (biases_.get() != NULL) { - MatrixPtr outV = getOutputValue(); - outV->addBias(*(biases_->getW()), 1); - } - - /* activation */ - forwardActivation(); -} - -void SequenceReshapeLayer::backward(const UpdateCallback& callback) { - /* activation */ - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } - - MatrixPtr inputGrad = getInputGrad(0); - MatrixPtr outputGrad = getOutputGrad(); - - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceReshapeLayerBackward", getName().c_str()); - - if (inputGrad) { - Matrix::resizeOrCreate(reshapedOutputGrad, - inputGrad->getHeight(), - inputGrad->getWidth(), - false, - useGpu_); - reshapedOutputGrad->copyFrom(*outputGrad); - inputGrad->add(*reshapedOutputGrad); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SequenceSliceLayer.cpp b/paddle/gserver/layers/SequenceSliceLayer.cpp deleted file mode 100644 index a6d810b583aab6e44faa583795686f06e17beeb9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequenceSliceLayer.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -class SequenceSliceLayer : public Layer { - public: - explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - private: - /* - * TODO(caoying) - * In PaddePaddle, currently all matrices are real number types, - * but the second and the (optional) third input which are some - * selected indices of the give sequence to trim the sequence, are actually - * filled with int types so that storing int types information in real number - * matrices is very dangerous, since real numbers will be convered to int - * types. 
If a user fills this matrix himself, invalid data may occor. - */ - - MatrixPtr startIdsOnCpu_; - MatrixPtr endIdsOnCpu_; - - std::vector selectedRows_; - IVectorPtr rowIndice_; - std::vector> inputSeqInfoVec_; - std::vector outSubSeqStartPos_; - std::vector outSeqStartPos_; - - void checkInputs(); - void copySliceIdsToCpu(); - void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends); -}; - -REGISTER_LAYER(seq_slice, SequenceSliceLayer); - -bool SequenceSliceLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - CHECK_GE(inputLayers_.size(), 2U); - CHECK_LE(inputLayers_.size(), 3U); - - setNeedSequenceInfo(false); - return true; -} - -void SequenceSliceLayer::checkInputs() { - const Argument& inputSeq = getInput(0); - CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer " - << "must be a sequence."; - const MatrixPtr indices1 = getInputValue(1); - CHECK_EQ( - indices1->getHeight(), - static_cast(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences() - : inputSeq.getNumSequences())) - << "Height of the second input should be equal to number of sequence " - << "in the first input."; - if (inputLayers_.size() == 3) { - const MatrixPtr indices2 = getInputValue(2); - CHECK_EQ(indices2->getHeight(), indices1->getHeight()) - << "start indices and end indices should have the same height."; - CHECK_EQ(indices2->getWidth(), indices1->getWidth()) - << "start indices and end indices should have the same Width."; - } -} - -void SequenceSliceLayer::copySliceIdsToCpu() { - const MatrixPtr indices1 = getInputValue(1); - if (inputLayers_.size() == 2U) { - if (config_.select_first()) { - Matrix::resizeOrCreate(startIdsOnCpu_, - indices1->getHeight(), - indices1->getWidth(), - false /* trans */, - false /* useGpu */); - startIdsOnCpu_->copyFrom(*indices1); - endIdsOnCpu_ = nullptr; - } else { - Matrix::resizeOrCreate(endIdsOnCpu_, - indices1->getHeight(), - indices1->getWidth(), - false /* trans */, - false /* useGpu */); - endIdsOnCpu_->copyFrom(*indices1); - startIdsOnCpu_ = nullptr; - } - } else if (inputLayers_.size() == 3U) { - Matrix::resizeOrCreate(startIdsOnCpu_, - indices1->getHeight(), - indices1->getWidth(), - false /* trans */, - false /* useGpu */); - startIdsOnCpu_->copyFrom(*indices1); - - const MatrixPtr indices2 = getInputValue(2); - Matrix::resizeOrCreate(endIdsOnCpu_, - indices2->getHeight(), - indices2->getWidth(), - false /* trans */, - false /* useGpu */); - endIdsOnCpu_->copyFrom(*indices2); - } -} - -void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts, - const MatrixPtr ends) { - CHECK(starts || ends) << "At least one of the start or end indices " - << "should be given."; - - bool hasSubseq = getInput(0).hasSubseq(); - - outSeqStartPos_.resize(1, 0); - outSubSeqStartPos_.resize(1, 0); - selectedRows_.clear(); - - size_t beamSize = starts ? starts->getWidth() : ends->getWidth(); - size_t rowIdx = 0; - for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) { - for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) { - for (size_t k = 0; k < beamSize; ++k) { - if (starts && starts->getElement(rowIdx, k) == -1.) break; - if (ends && ends->getElement(rowIdx, k) == -1.) 
break; - - int begPos = inputSeqInfoVec_[i][j]; - if (starts) begPos += starts->getElement(rowIdx, k); - - int endPos = inputSeqInfoVec_[i][j + 1] - 1; - if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k); - - int seqLen = endPos - begPos + 1; - CHECK_GT(seqLen, 0); - for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m); - hasSubseq - ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen) - : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen); - } - rowIdx++; - } - if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back()); - } - - if (useGpu_) { - rowIndice_ = IVector::create(selectedRows_.size(), useGpu_); - rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size()); - } else { - rowIndice_ = - IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_); - } - - // create the sequence information for the output. - ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, outSeqStartPos_.size(), false); - output_.sequenceStartPositions->copyFrom( - outSeqStartPos_.data(), outSeqStartPos_.size(), false); - - if (hasSubseq) { - ICpuGpuVector::resizeOrCreate( - output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false); - output_.subSequenceStartPositions->copyFrom( - outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false); - } -} - -void SequenceSliceLayer::forward(PassType passType) { - Layer::forward(passType); - checkInputs(); - - const Argument& inputSeq = getInput(0); - inputSeqInfoVec_.clear(); - Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions, - inputSeq.subSequenceStartPositions, - inputSeqInfoVec_); - if (!useGpu_) { - if (inputLayers_.size() == 2U) { - startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr; - endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1); - } else if (inputLayers_.size() == 3U) { - startIdsOnCpu_ = getInputValue(1); - endIdsOnCpu_ = getInputValue(2); - } - } else { - copySliceIdsToCpu(); - } - - /* - * calculate the selected row indices in a batch, and build the output - * sequence information. - */ - calSelectedRows(startIdsOnCpu_, endIdsOnCpu_); - - resetOutput(selectedRows_.size(), getSize()); - - getOutputValue()->selectRows(*getInputValue(0), *rowIndice_); -} - -void SequenceSliceLayer::backward(const UpdateCallback& callback) { - getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SequenceToBatch.h b/paddle/gserver/layers/SequenceToBatch.h deleted file mode 100644 index 5200e702d9bc947746567c19ca7d552750828131..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequenceToBatch.h +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" - -namespace paddle { - -/* - * This class can used to modify the matrix structure of sequence matrix into - * batch structure. 
- * sequence matrix: [C1_s ... Cn_s | ...... | C1_t ... Cn_t]
- * batch matrix: [C1_s ... C1_t | ...... | Cn_s ... Cn_t]
- * Cn_s is the state for sequence s at time n.
- *
- * Example: sequence matrix = {{0, 0, 0, 0}, {1, 1, 1, 1, 1}, {2, 2, 2}}
- * s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
- * batch matrix = {{1, 0, 2}, {1, 0, 2}, {1, 0, 2}, {1, 0}, {1}}
- * b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
- *
- * Use:
- * Input: seqMatrix, seqStarts(Sequence Start Positions)
- * Output: batchMatrix
- * 1. SequenceToBatch seq2batch;
- * 2. seq2batch.resizeOrCreateBatch(seqStarts); // calculate seq2BatchIdx
- * 3. seq2batch.copy(seqMatrix, batchMatrix, true); // copy seq to batch matrix
- *
- */
-class SequenceToBatch {
- public:
-  explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {}
-
-  /* resize and calculate the batchIndex_ */
-  void resizeOrCreateBatch(int batchSize,
-                           size_t numSequences,
-                           const int *seqStarts,
-                           bool reversed,
-                           bool prevBatchState = false);
-
-  /* sequence matrix and batch matrix copy:
-   * seq2batch: copy(seqValue, batchValue, true);
-   * batch2seq: copy(seqValue, batchValue, false);
-   */
-  void copy(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
-  /* sequence/batch matrix add to batch/sequence matrix */
-  void add(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
-  MatrixPtr getBatchValue(Matrix &batchValue, int batchId, int numRows = 0);
-
-  size_t getNumBatch() const { return numBatch_; }
-
-  /* resize or create a batch matrix(batchValue_) */
-  void resizeOrCreate(Matrix &seqValue);
-  /* copy seqValue to batchValue_ */
-  void copyFromSeq(Matrix &seqValue);
-  /* copy batchValue_ to seqValue */
-  void copyBackSeq(Matrix &seqValue);
-  MatrixPtr getBatchValue(int batchId, int numRows = 0);
-  MatrixPtr getBatchValue() { return batchValue_; }
-  /* transfer preBatchOutput to batch struct */
-  void prevOutput2Batch(Matrix &src, Matrix &dst);
-  /* get sequence output from batch struct */
-  void getSeqOutputFromBatch(Matrix &sequence, Matrix &batch);
-
-  /* Copy the index from another seq2batch. */
-  void shareIndexWith(const SequenceToBatch &seq2batch) {
-    CHECK(useGpu_ == seq2batch.useGpu_);
-    batchStartPositions_ = seq2batch.batchStartPositions_;
-    seq2BatchIdx_ = seq2batch.seq2BatchIdx_;
-    cpuSeq2BatchIdx_ = seq2batch.cpuSeq2BatchIdx_;
-    numBatch_ = seq2batch.numBatch_;
-  }
-
- protected:
-  void sequence2BatchCopy(Matrix &batch,
-                          Matrix &sequence,
-                          IVector &seq2BatchIdx,
-                          bool seq2batch);
-  void sequence2BatchAdd(Matrix &batch,
-                         Matrix &sequence,
-                         IVector &seq2BatchIdx,
-                         bool seq2batch);
-
-  IVectorPtr batchStartPositions_;
-  IVectorPtr seq2BatchIdx_;
-  IVectorPtr cpuSeq2BatchIdx_;
-  IVectorPtr cpuSeqIdx_;
-  IVectorPtr cpuSeqEndIdxInBatch_;
-  IVectorPtr seqIdx_;
-  IVectorPtr seqEndIdxInBatch_;
-  size_t numBatch_;
-  bool useGpu_;
-  MatrixPtr batchValue_;
-};
-
-} // namespace paddle
diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp
deleted file mode 100644
index f7f4735c1b72d4ac6540714573fd7e15ef99ea5b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SlopeInterceptLayer.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
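The worked example in the SequenceToBatch comment above can be reproduced with ordinary index juggling: batch t collects the IDs of every sequence longer than t, ordered by descending length. A self-contained sketch (my own illustration, not the class's interface):

```cpp
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>

// For each timestep t, collect the IDs of all sequences longer than t,
// ordered by descending length -- the batch layout described above.
std::vector<std::vector<int>> seqToBatches(const std::vector<int>& lens) {
  std::vector<int> order(lens.size());
  std::iota(order.begin(), order.end(), 0);
  std::stable_sort(order.begin(), order.end(),
                   [&](int a, int b) { return lens[a] > lens[b]; });
  int maxLen = *std::max_element(lens.begin(), lens.end());
  std::vector<std::vector<int>> batches(maxLen);
  for (int t = 0; t < maxLen; ++t)
    for (int id : order)
      if (lens[id] > t) batches[t].push_back(id);
  return batches;
}

int main() {
  // Lengths from the comment: s0 has 4 steps, s1 has 5, s2 has 3.
  for (const auto& b : seqToBatches({4, 5, 3})) {
    for (int id : b) std::cout << id << ' ';
    std::cout << '\n';  // 1 0 2 / 1 0 2 / 1 0 2 / 1 0 / 1
  }
}
```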
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for applying a slope and an intercept to the input - * element-wise. - * This layer is used in NEURAL TURING MACHINE. - * @note There is no activation and weight in this layer. - * - * \f[ - * y = ax + b - * \f] - * - * Here, a is scale and b is offset, which are provided as attributes of the - * layer. - * - * The config file api is slope_intercept_layer. - */ - -class SlopeInterceptLayer : public Layer { - public: - explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(slope_intercept, SlopeInterceptLayer); - -bool SlopeInterceptLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 1U); - - return true; -} - -void SlopeInterceptLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV = getInputValue(0); - - /* malloc memory for the output_ if necessary */ - size_t batchSize = inV->getHeight(); - size_t size = getSize(); - - CHECK_EQ(size, inV->getWidth()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, size); - } - - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwSlopeInterceptTimer", getName().c_str()); - outV->mulScalar(*inV, config_.slope()); - outV->add(config_.intercept()); - } -} - -void SlopeInterceptLayer::backward(const UpdateCallback& callback) { - MatrixPtr inG = getInputGrad(0); - MatrixPtr outG = getOutputGrad(); - - if (inG) { - REGISTER_TIMER_INFO("BwSlopeInterceptTimer", getName().c_str()); - inG->add(*outG, config_.slope()); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h deleted file mode 100644 index 421bdfe09c46f656f500daff195c755274bf8bb7..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SpatialPyramidPoolLayer.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
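The slope_intercept layer deleted above is just an element-wise affine map with a constant Jacobian, so the backward pass only scales the output gradient by the slope. A minimal sketch of both rules (standalone helpers, not the Layer API):

```cpp
#include <iostream>
#include <vector>

// Element-wise y = a*x + b as in slope_intercept above; since dy/dx = a,
// the backward pass just accumulates a * outputGrad into the input grad.
void forward(const std::vector<float>& x, float a, float b,
             std::vector<float>& y) {
  y.assign(x.size(), 0.f);
  for (size_t i = 0; i < x.size(); ++i) y[i] = a * x[i] + b;
}

void backward(const std::vector<float>& dy, float a, std::vector<float>& dx) {
  for (size_t i = 0; i < dy.size(); ++i) dx[i] += a * dy[i];  // inG->add(outG, a)
}

int main() {
  std::vector<float> x = {1.f, 2.f}, y, dx(2, 0.f);
  forward(x, 2.f, 0.5f, y);       // y = {2.5, 4.5}
  backward({1.f, 1.f}, 2.f, dx);  // dx = {2, 2}
  std::cout << y[0] << ' ' << y[1] << ' ' << dx[0] << '\n';
}
```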
*/
-
-#pragma once
-
-#include "Layer.h"
-#include "PoolProjection.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-/**
- * @brief A layer for spatial pyramid pooling on the input image by taking
- * the max, average, etc. within regions, so that the result vector of
- * different-sized images is of the same size.
- *
- * The config file api is spp_layer.
- */
-
-class SpatialPyramidPoolLayer : public Layer {
- protected:
-  size_t channels_;
-  size_t imgSizeW_;
-  size_t imgSizeH_;
-  size_t pyramidHeight_;
-  std::string poolType_;
-
-  std::vector<std::unique_ptr<PoolProjection>> poolProjections_;
-  std::vector<Argument> projOutput_;
-  std::vector<std::pair<size_t, size_t>> projCol_;
-
- public:
-  explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  ProjectionConfig getConfig(size_t sizeX_,
-                             size_t sizeY_,
-                             size_t channels,
-                             size_t pyramidLevel_,
-                             std::string& poolType_);
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-} // namespace paddle
diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
deleted file mode 100644
index e2bb00bbfacb26dc736a63877119b379f22b5983..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-class SubNestedSequenceLayer : public Layer {
- public:
-  explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  /*
-   * This function generates the indices of rows in a batch according to the
-   * indices of the selected sub-sequences in each sequence.
-   *
-   * Examples:
-   * selectedIndices:
-   *   [
-   *     [0, 1, -1],
-   *     [0, 1, 2],
-   *     [0, -1, -1],
-   *     [0, 2, 3],
-   *   ]
-   * inputSeqInfo:
-   *   [
-   *     [0,3,4],
-   *     [4,5,7,10,15],
-   *     [15,20],
-   *     [20,22,23,25,28]
-   *   ]
-   *
-   * the output is saved to the private member rowIndice_;
-   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
-   */
-
-  void calSelectedRows(const MatrixPtr selectedIndices,
-                       const std::vector<std::vector<int>>& inputSeqInfo);
-
-  /*
-   * TODO(caoying)
-   * In PaddlePaddle, currently all matrices are real-valued, but the second
-   * input, which holds the selected indices used to trim the nested
-   * sequence, is actually filled with int values, so storing int values in
-   * real-valued matrices is very dangerous: the real numbers will be
-   * converted to int types. If a user fills this matrix himself, invalid
-   * data may occur.
-   *
-   * If the second input of this layer is on GPU memory, copy it to CPU
-   * memory.
-   */
-  MatrixPtr selIdsCpu_;
-
-  /*
-   * reorganize sequenceStartPositions and subSequenceStartPositions
-   * into a 2d vector to facilitate the sequence selection process.
-   */
-  std::vector<std::vector<int>> inputSeqInfoVec_;
-
-  /* store the final selected row indices in a batch */
-  IVectorPtr rowIndice_;
-  /* rowIndice_ and selectedRows_ actually share the same memory. */
-  std::vector<int> selectedRows_;
-};
-
-REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer);
-
-bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(2U, inputLayers_.size());
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SubNestedSequenceLayer::calSelectedRows(
-    const MatrixPtr selectedIndices,
-    const std::vector<std::vector<int>>& inputSeqInfo) {
-  selectedRows_.clear();
-
-  std::vector<int> outSeqStartInfo(1, 0);
-  std::vector<int> outSubSeqStartInfo(1, 0);
-
-  size_t seqNum = selectedIndices->getHeight();
-  size_t beamSize = selectedIndices->getWidth();
-  for (size_t i = 0; i < seqNum; ++i) {
-    for (size_t j = 0; j < beamSize; ++j) {
-      if (selectedIndices->getElement(i, j) == -1.) break;
-      size_t selSubSeqIdx = selectedIndices->getElement(i, j);
-      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);
-
-      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
-                         inputSeqInfoVec_[i][selSubSeqIdx];
-      for (size_t k = 0; k < subSeqLen; ++k)
-        selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k);
-      outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen);
-    }
-    outSeqStartInfo.push_back(outSubSeqStartInfo.back());
-  }
-
-  if (useGpu_) {
-    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
-    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
-  } else {
-    rowIndice_ =
-        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
-  }
-
-  // create the sequence information for the output.
-  ICpuGpuVector::resizeOrCreate(
-      output_.sequenceStartPositions, outSeqStartInfo.size(), false);
-  output_.sequenceStartPositions->copyFrom(
-      outSeqStartInfo.data(), outSeqStartInfo.size(), false);
-
-  ICpuGpuVector::resizeOrCreate(
-      output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false);
-  output_.subSequenceStartPositions->copyFrom(
-      outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false);
-}
-
-void SubNestedSequenceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& inputSeq = getInput(0);
-  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
-                              << "must be a nested sequence.";
-  const MatrixPtr selectedIndices = getInputValue(1);
-  CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight());
-
-  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
-    /*
-     * Currently, the second input for this layer is generated by
-     * kmax_sequence_score_layer whose output is always stored on CPU,
-     * or a data_layer which can be on GPU.
-     *
-     * If the second input is on GPU, copy it to CPU memory, because this
-     * input always uses very little memory, and operations related to it
-     * are all logic control, not computations.
-     */
-    Matrix::resizeOrCreate(selIdsCpu_,
-                           selectedIndices->getHeight(),
-                           selectedIndices->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    selIdsCpu_->copyFrom(*selectedIndices);
-  } else {
-    selIdsCpu_ = selectedIndices;
-  }
-
-  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
-                              inputSeq.subSequenceStartPositions,
-                              inputSeqInfoVec_);
-  calSelectedRows(selIdsCpu_, inputSeqInfoVec_);
-
-  resetOutput(selectedRows_.size(), getSize());
-  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
-}
-
-void SubNestedSequenceLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inputSeqGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-
-  if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_);
-}
-
-} // namespace paddle
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
deleted file mode 100644
index ba49f5710f9d0bb985cf1e80d5c4a972d8f046a6..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for taking the subsequence according to the given offset and size
- * Input: original sequence, offset, size
- * Output: subsequence
- */
-
-class SubSequenceLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-  MatrixPtr tmpSrc_;
-  MatrixPtr tmpDest_;
-
- public:
-  explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(subseq, SubSequenceLayer);
-
-bool SubSequenceLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // the sub-sequence layer should have exactly 3 inputs
-  CHECK_EQ(3U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SubSequenceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t dim = getSize();
-
-  const Argument& input = getInput(0);
-  size_t numSequences1 = input.getNumSequences();
-  auto startPositions1 = input.sequenceStartPositions->getVector(false);
-
-  const Argument& offsetSeq = getInput(1);
-  size_t numSequences2 = offsetSeq.getNumSequences();
-  auto startPositions2 =
offsetSeq.sequenceStartPositions->getVector(false); - - const Argument& sizeSeq = getInput(2); - size_t numSequences3 = sizeSeq.getNumSequences(); - auto startPositions3 = sizeSeq.sequenceStartPositions->getVector(false); - - CHECK_EQ(dim, input.value->getWidth()); - - CHECK_EQ(startPositions1->getData()[numSequences1], input.getBatchSize()); - CHECK_EQ(numSequences1, startPositions1->getSize() - 1); - - CHECK_EQ(startPositions2->getData()[numSequences2], offsetSeq.getBatchSize()); - CHECK_EQ(numSequences2, startPositions2->getSize() - 1); - - CHECK_EQ(startPositions3->getData()[numSequences3], sizeSeq.getBatchSize()); - CHECK_EQ(numSequences3, startPositions3->getSize() - 1); - - CHECK_EQ(numSequences1, numSequences2); - CHECK_EQ(numSequences2, numSequences3); - - MatrixPtr inputValue = input.value; - IVectorPtr offsetValue; - IVectorPtr sizeValue; - - if (useGpu_) { - // copy to cpu - IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false); - IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false); - offsetValue->copyFrom(*offsetSeq.ids); - sizeValue->copyFrom(*sizeSeq.ids); - } else { - offsetValue = offsetSeq.ids; - sizeValue = sizeSeq.ids; - } - - CHECK_EQ(offsetValue->getSize(), numSequences1); - CHECK_EQ(sizeValue->getSize(), numSequences1); - - int* offsets = offsetValue->getData(); - int* sizes = sizeValue->getData(); - - // get total height of output - size_t height = 0; - for (size_t seqId = 0; seqId < numSequences1; seqId++) { - height += sizes[seqId]; - } - - // reset output - resetOutput(height, dim); - - MatrixPtr outputValue = getOutputValue(); - - const int* starts1 = startPositions1->getData(); - - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SubSequenceLayerForward", getName().c_str()); - - size_t offsetIn = 0; - size_t offsetOut = 0; - size_t size = 0; - for (size_t seqId = 0; seqId < numSequences1; ++seqId) { - offsetIn = starts1[seqId] + offsets[seqId]; - size = sizes[seqId]; - - outputValue->subMatrix(offsetOut, size, tmpDest_) - ->assign(*(inputValue->subMatrix(offsetIn, size, tmpSrc_))); - - offsetOut += size; - } - - // modify the sequenceStartPositions - ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, numSequences1 + 1, false); - - int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); - int offset = 0; - for (size_t seqId = 0; seqId < numSequences1; ++seqId) { - tgtBuf[seqId] = offset; - offset += sizes[seqId]; - } - tgtBuf[numSequences1] = offset; - } - - if (biases_.get() != NULL) { - MatrixPtr outV = getOutputValue(); - outV->addBias(*(biases_->getW()), 1); - } - - /* activation */ - forwardActivation(); -} - -void SubSequenceLayer::backward(const UpdateCallback& callback) { - /* activation */ - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } - - MatrixPtr inputGrad1 = getInputGrad(0); - MatrixPtr outputGrad = getOutputGrad(); - auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false); - size_t numSequences1 = startPositions1->getSize() - 1; - const int* starts1 = startPositions1->getData(); - - const Argument& offsetSeq = getInput(1); - const Argument& sizeSeq = getInput(2); - IVectorPtr offsetValue; - IVectorPtr sizeValue; - - if (useGpu_) { - // copy to cpu - IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false); - IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false); - 
offsetValue->copyFrom(*offsetSeq.ids); - sizeValue->copyFrom(*sizeSeq.ids); - } else { - offsetValue = offsetSeq.ids; - sizeValue = sizeSeq.ids; - } - - int* offsets = offsetValue->getData(); - int* sizes = sizeValue->getData(); - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SubSequenceLayerBackward", getName().c_str()); - - int offsetIn = 0; - int offsetOut = 0; - int size = 0; - for (size_t seqId = 0; seqId < numSequences1; ++seqId) { - offsetIn = starts1[seqId] + offsets[seqId]; - size = sizes[seqId]; - - inputGrad1->subMatrix(offsetIn, size, tmpDest_) - ->add(*(outputGrad->subMatrix(offsetOut, size, tmpSrc_))); - offsetOut += size; - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp deleted file mode 100644 index 00764717e8b6be30230e44626974033e929352da..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SumToOneNormLayer.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for sum-to-one normalization, - * which is used in NEURAL TURING MACHINE. - * \f[ - * out[i] = \frac {in[i]} {\sum_{k=1}^N in[k]} - * \f] - * where \f$in\f$ is a (batchSize x dataDim) input vector, - * and \f$out\f$ is a (batchSize x dataDim) output vector. - * - * The config file api is sum_to_one_norm_layer. 
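The sum-to-one normalization defined by the formula above has a closed-form backward rule; the implementation that follows computes it per row via dotSum_ and reciprocalRowSum_. A single-row numeric sketch of the same math (my own helper names, not the layer's API):

```cpp
#include <iostream>
#include <numeric>
#include <vector>

// Sum-to-one normalization for one row: y[i] = x[i] / S, with S = sum(x).
// Backward, derived from y = x / S:
//   dx[j] += dy[j]/S - (dy . y)/S
// which matches the dotSum_/reciprocalRowSum_ steps in the code below.
void forward(const std::vector<double>& x, std::vector<double>& y, double& S) {
  S = std::accumulate(x.begin(), x.end(), 0.0);
  y.resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] / S;
}

void backward(const std::vector<double>& y, const std::vector<double>& dy,
              double S, std::vector<double>& dx) {
  double dot = 0.0;  // dy . y, the per-row dotSum_
  for (size_t i = 0; i < y.size(); ++i) dot += dy[i] * y[i];
  for (size_t i = 0; i < y.size(); ++i) dx[i] += dy[i] / S - dot / S;
}

int main() {
  std::vector<double> x = {1, 3}, y, dx(2, 0.0);
  double S;
  forward(x, y, S);            // y = {0.25, 0.75}, S = 4
  backward(y, {1, 0}, S, dx);  // dx = {0.1875, -0.0625}
  std::cout << y[0] << ' ' << y[1] << ' ' << dx[0] << ' ' << dx[1] << '\n';
}
```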
- */ - -class SumToOneNormLayer : public Layer { - protected: - /// reciprocalRowSum_ = \f$1 / \sum_{k=1}^N in[k]\f$ - MatrixPtr reciprocalRowSum_; - /// dotSum = output_.grad \f$.*\f$ output_.value - MatrixPtr dotSum_; - - public: - explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer); - -bool SumToOneNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 1U); - - return true; -} - -void SumToOneNormLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV = getInputValue(0); - - /* malloc memory for the output_ if necessary */ - size_t batchSize = inV->getHeight(); - size_t dataDim = getSize(); - - CHECK_EQ(dataDim, inV->getWidth()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - resetOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwSumToOneNormTimer", getName().c_str()); - - Matrix::resizeOrCreate(reciprocalRowSum_, batchSize, 1, false, useGpu_); - inV->rowSum(*reciprocalRowSum_); - - // todo: matrix checks - CHECK_GT(reciprocalRowSum_->getMin(), 0.0); - - reciprocalRowSum_->scalarDiv(*reciprocalRowSum_, 1.0); - - // outV = inV * reciprocalRowSum - outV->rowScale(0, *inV, *reciprocalRowSum_); - } -} - -void SumToOneNormLayer::backward(const UpdateCallback& callback) { - MatrixPtr inV = getInputValue(0); - MatrixPtr inG = getInputGrad(0); - MatrixPtr outV = getOutputValue(); - MatrixPtr outG = getOutputGrad(); - - size_t batchSize = inV->getHeight(); - - if (inG) { - REGISTER_TIMER_INFO("BwSumToOneTimer", getName().c_str()); - - Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_); - - // dotSum = outG .* outV - dotSum_->zeroMem(); - dotSum_->rowDotMul(0, *outG, *outV); - - // inG += -1 * (dotSum / rowSum) - dotSum_->dotMul(*dotSum_, *reciprocalRowSum_); - inG->rowAdd(0, *inG, *dotSum_, -1.0); - // inG += outG * (1/rowSum) - inG->addRowScale(0, *outG, *reciprocalRowSum_); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h deleted file mode 100644 index 5c1ee40ceda9387138a82368ec4edcbae4bd3419..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/TensorLayer.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/** - * @brief TensorLayer takes two input vectors. - * \f[ - * y_{i} = x_{1} * W_{i} * x_{2}^{\rm T}, i=0, 1, ...,K-1 - * \f] - * - * - \f$x_{1}\f$: the first input, size is M. - * - \f$x_{2}\f$: the second input, size is N. 
- * - y: output, size is K.
- * - \f$y_{i}\f$: i-th element of y.
- * - \f$W_{i}\f$: the i-th learned weight, dimensions: [M, N].
- * - \f$x_{2}^{\rm T}\f$: the transpose of \f$x_{2}\f$.
- *
- * The config file api is tensor_layer.
- */
-
-class TensorLayer : public Layer {
- protected:
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-} // namespace paddle
diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h
deleted file mode 100644
index 1cd8fd91f785d5a43fc7d7663e657702b32fa534..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/TransLayer.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * A layer for transposing a minibatch matrix.
- * \f[
-   y = x^\mathrm{T}
- * \f]
- * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
- *
- * The config file api is trans_layer.
- */
-class TransLayer : public Layer {
- public:
-  explicit TransLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-} // namespace paddle
diff --git a/paddle/gserver/layers/UpsampleLayer.h b/paddle/gserver/layers/UpsampleLayer.h
deleted file mode 100644
index c9d079c3141c37517866bfdad10d9b2cdb89f7d5..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/UpsampleLayer.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * This layer transposes the pooling process.
- * It takes two inputs: the first is the input data, and
- * the second is the mask data from the max-pool-with-mask layer.
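Each output element of the tensor layer above is a bilinear form between the two input vectors. A standalone sketch of one such slice (illustrative only, with plain nested vectors in place of Paddle matrices):

```cpp
#include <iostream>
#include <vector>

using Mat = std::vector<std::vector<double>>;

// One output slice of the tensor layer: y_i = x1 * W_i * x2^T,
// i.e. a bilinear form between the two input vectors (sizes M and N).
double bilinear(const std::vector<double>& x1,  // size M
                const Mat& Wi,                  // M x N
                const std::vector<double>& x2)  // size N
{
  double y = 0.0;
  for (size_t m = 0; m < x1.size(); ++m)
    for (size_t n = 0; n < x2.size(); ++n)
      y += x1[m] * Wi[m][n] * x2[n];
  return y;
}

int main() {
  // M = 2, N = 2; K such slices would give the full K-dimensional output.
  Mat Wi = {{1, 0}, {0, 2}};
  std::cout << bilinear({1, 2}, Wi, {3, 4}) << '\n';  // 1*1*3 + 2*2*4 = 19
}
```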
- * - */ - -class UpsampleLayer : public Layer { - public: - explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {} - ~UpsampleLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - - size_t getOutputSize(); - - protected: - size_t scale_, scaleY_; - size_t upsampleSize_, upsampleSizeY_; - size_t padOutX_, padOutY_; - size_t imgSize_, imgSizeY_; - size_t channels_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h deleted file mode 100644 index be41128ef4530f32a63c757648c2f393fd118ea6..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ValidationLayer.h +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "Layer.h" -#include "paddle/gserver/evaluators/Evaluator.h" - -DECLARE_int32(trainer_id); - -namespace paddle { - -class ValidationLayer : public Layer { - public: - explicit ValidationLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - LayerPtr getOutputLayer() { return inputLayers_[0]; } - - LayerPtr getLabelLayer() { return inputLayers_[1]; } - - LayerPtr getInfoLayer() { - assert(inputLayers_.size() > 2); - return inputLayers_[2]; - } - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback = nullptr) override; - - virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0; - - void onPassEnd() override = 0; -}; - -/* - * AucValidation - */ -class AucValidation : public ValidationLayer { - public: - explicit AucValidation(const LayerConfig& config) - : ValidationLayer(config), - cpuOutput_(nullptr), - cpuLabel_(nullptr), - cpuWeight_(nullptr) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void validationImp(MatrixPtr outputValue, IVectorPtr label) override; - - void onPassEnd() override; - - struct PredictionResult { - PredictionResult(real __out, int __label) : out(__out), label(__label) {} - real out; - int label; - }; - std::vector predictArray_; - - private: - bool passBegin_; - std::unique_ptr evaluator_; - MatrixPtr cpuOutput_; - IVectorPtr cpuLabel_; - MatrixPtr cpuWeight_; -}; - -/* - * positive-negative pair rate Validation - */ -class PnpairValidation : public ValidationLayer { - public: - explicit PnpairValidation(const LayerConfig& config) - : ValidationLayer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void validationImp(MatrixPtr outputValue, IVectorPtr label) override; - - void onPassEnd() override; - - private: - bool passBegin_; - std::unique_ptr evaluator_; -}; - -typedef std::shared_ptr ValidationLayerPtr; -} // namespace paddle diff --git 
a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
deleted file mode 100644
index 9d7cad7584d1defefe38bdd4d041b98bd9e45bf0..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/CMakeLists.txt
+++ /dev/null
@@ -1,103 +0,0 @@
-# gserver package unittests
-add_simple_unittest(test_LinearChainCRF)
-add_simple_unittest(test_RecurrentLayer)
-
-if(NOT MOBILE_INFERENCE)
-  add_simple_unittest(test_MultinomialSampler)
-endif()
-
-function(gserver_test TARGET)
-  add_unittest_without_exec(${TARGET}
-      ${TARGET}.cpp
-      LayerGradUtil.cpp)
-  add_test(NAME ${TARGET}
-      COMMAND ${TARGET})
-endfunction()
-
-add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/concat_dotmul_a.conf
-    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
-)
-add_custom_target(copy_gserver_conf ALL DEPENDS concat_dotmul_a.conf)
-
-gserver_test(test_LayerGrad)
-gserver_test(test_CRFLayerGrad)
-gserver_test(test_CrossEntropyOverBeamGrad)
-gserver_test(test_SeqSliceLayerGrad)
-gserver_test(test_ActivationGrad)
-gserver_test(test_ConvTrans)
-gserver_test(test_PriorBox)
-gserver_test(test_DetectionOutput)
-gserver_test(test_ConvUnify)
-gserver_test(test_BatchNorm)
-gserver_test(test_KmaxSeqScore)
-gserver_test(test_Expand)
-gserver_test(test_MaxPoolingWithMaskOutput)
-gserver_test(test_Upsample)
-
-set(PYTHON_PATH
-    ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-    ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/gserver/tests)
-function(gserver_test_with_python TARGET)
-  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
-  add_test(NAME ${TARGET}
-      COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
-      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
-endfunction()
-
-gserver_test_with_python(test_PyDataProvider2)
-if(WITH_PYTHON)
-  gserver_test_with_python(test_PyDataProvider)
-endif()
-if(NOT MOBILE_INFERENCE)
-  gserver_test_with_python(test_CompareTwoNets)
-  # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
- gserver_test_with_python(test_RecurrentGradientMachine) -endif() - -########## test_MKLDNN layers and activations ########## -if(WITH_MKLDNN) - add_unittest_without_exec(test_MKLDNN - test_MKLDNN.cpp - MKLDNNTester.cpp - LayerGradUtil.cpp) - add_test(NAME test_MKLDNN - COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle) -endif() - -############### test_WarpCTCLayer ####################### -if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) - add_unittest_without_exec(test_WarpCTCLayer - test_WarpCTCLayer.cpp) - add_test(NAME test_WarpCTCLayer - COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR} - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle) -endif() - -if(NOT MOBILE_INFERENCE) - ################## test_Evaluator ############# - add_unittest(test_Evaluator - test_Evaluator.cpp) - - ########### test_NetworkCompare ############### - add_unittest_without_exec(test_NetworkCompare - test_NetworkCompare.cpp) - if(WITH_GPU) - set(use_gpu true) - else() - set(use_gpu false) - endif() - add_test(NAME test_NetworkCompare - COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu} - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle) - - ############ test_CompareSparse ################ - add_unittest_without_exec(test_CompareSparse - test_CompareSparse.cpp) - if(NOT ON_TRAVIS) - add_test(NAME test_CompareSparse - COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 6 - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) - endif() -endif() diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h deleted file mode 100644 index 1999b2204b1728bd60b1e107dfe7b10718e752a5..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/LayerGradUtil.h +++ /dev/null @@ -1,329 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" - -#include "paddle/testing/TestUtil.h" -using namespace std; // NOLINT - -namespace paddle { -enum InputType { - INPUT_DATA, // dense vector - INPUT_LABEL, // id - INPUT_DATA_TARGET, // dense vector, but no gradient - INPUT_SEQUENCE_DATA, - INPUT_HASSUB_SEQUENCE_DATA, // sequence has sub-sequence - INPUT_SEQUENCE_MDIM_DATA, - INPUT_SEQUENCE_LABEL, - INPUT_SPARSE_NON_VALUE_DATA, - INPUT_SPARSE_FLOAT_VALUE_DATA, - INPUT_DENSE_DIM_DATA, // using sequence length to init dense data - INPUT_SELF_DEFINE_DATA, // support customizing for input value -}; - -struct ParaSparse { - bool sparse; - string format; - // if equalNnzPerSample is set true, - // every row of the sparse matrix in a format of CSR has a same - // number of nnz values. 
Currently, this flag is only used for - // selective_fc layer - bool equalNnzPerSample; - ParaSparse(const string& formatIn = "") { // NOLINT - if (formatIn == "") { - sparse = false; - } else { - sparse = true; - } - equalNnzPerSample = false; - } - ParaSparse(const string& formatIn, bool equalNnz) { - format = formatIn; - sparse = true; - equalNnzPerSample = equalNnz; - } -}; - -struct InputDef { - InputType inputType; - string name; - size_t dim; - size_t paraSize; - ParaSparse sparse; - bool isStatic; - std::vector labelInitValue; - std::vector labelSeqStartPositions; - std::vector labelSubSeqStartPositions; - std::vector ids; - MatrixPtr selfDefinedData; - - InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { - inputType = type; - name = nameIn; - dim = dimIn; - paraSize = sizeIn; - sparse = {""}; - isStatic = false; - } - - InputDef(InputType type, - string nameIn, - MatrixPtr selfDefinedData, - std::vector selfDefinedSeqStartPos = {}, - std::vector selfDefinedSubSeqStartPos = {}) - : labelSeqStartPositions(selfDefinedSeqStartPos), - labelSubSeqStartPositions(selfDefinedSubSeqStartPos), - selfDefinedData(selfDefinedData) { - inputType = type; - name = nameIn; - dim = 0; - sparse = {""}; - paraSize = 0; - isStatic = false; - } - - InputDef(InputType type, - string nameIn, - const std::vector& ids, - const std::vector& selfDefinedSeqStartPos = {}, - const std::vector& selfDefinedSubSeqStartPos = {}) - : labelSeqStartPositions(selfDefinedSeqStartPos), - labelSubSeqStartPositions(selfDefinedSubSeqStartPos), - ids(ids) { - selfDefinedData = nullptr; - inputType = type; - name = nameIn; - dim = 0; - sparse = {""}; - paraSize = 0; - isStatic = false; - } - - InputDef(InputType type, - string nameIn, - size_t dimIn, - size_t sizeIn, - const std::vector& labelInitValue, - const std::vector& labelSeqStartPositions) - : labelInitValue(labelInitValue), - labelSeqStartPositions(labelSeqStartPositions) { - inputType = type; - name = nameIn; - dim = dimIn; - paraSize = sizeIn; - sparse = {""}; - isStatic = false; - } - - InputDef(InputType type, - string nameIn, - size_t dimIn, - size_t sizeIn, - ParaSparse sparseIn) { - inputType = type; - name = nameIn; - dim = dimIn; - paraSize = sizeIn; - sparse = sparseIn; - } -}; - -struct TestConfig { - LayerConfig layerConfig; - std::vector inputDefs; - size_t biasSize; - real paramInitialMean; - real paramInitialStd; - bool testAccumulate; - bool testState; - bool staticBias; - bool testBatchState; - TestConfig() - : biasSize(0), - paramInitialMean(0.0), - paramInitialStd(1.0), - testAccumulate(true), - testState(false), - staticBias(false), - testBatchState(false) {} -}; - -real getCostSum(ParameterPtr& parameter, - CpuVector& cpuPara, - LayerPtr& testLayer, - MatrixPtr weights = nullptr); - -real getDiffAndPrint(real newCost1, - real newCost2, - real callbackCount, - char fill, - string testLayerName, - string name, - real step, - real delta); - -/** - * @brief verify that sequentially running forward() one timestamp at one time - * has same result as running forward() with one whole sequence - * - * @param testLayer[in/out] testLayer - * @param dataLayers[in/out] dataLayers - * @param datas[in/out] data of dataLayers - */ -void testState(LayerPtr testLayer, - vector& dataLayers, - vector& datas); - -/** - * @brief verify that sequentially running forward() with short sequences one - * time has same result as running forward() with long sequences. 
- * - * @param testLayer[in/out] testLayer - * @param dataLayers[in/out] dataLayers - * @param datas[in/out] data of dataLayers - */ -void testBatchState(LayerPtr testLayer, - vector& dataLayers, - vector& datas); - -/** - * @brief Generate a perturbation so that it is roughly aligned with the - * gradient direction. This is to make sure that change along this - * direction will make cost increase (or decrease) in a meaningful - * way so that the finite difference can be used to approximate the - * directional dirivative well. - * - * @param oldGrad[in] input gradient - * newGrad[out] output gradient - * dim dimension of oldGrad/newGrad - * - * @return sum_i(oldGrad[i] * newGrad[i]) - */ -double genPerturbation(const real* oldGrad, real* newGrad, size_t dim); - -void initWeight(MatrixPtr& weights); - -void initBatchState(LayerPtr dataLayer, - LayerPtr testLayer, - LayerStatePtr state, - bool useGpu); - -/** - * @brief initialize the dataLayer by its inputType - * - * @param testConf[in] test config - * dataLayers[out] dataLayers - * datas[out] initialized data of dataLayers - * layerMap[out] layerMap - */ -void initDataLayer(TestConfig testConf, - std::vector* dataLayers, - vector* datas, - LayerMap* layerMap, - string testLayerName, - size_t batchSize, - bool trans, - bool useGpu); - -/** - * @brief initialize the parameter of testLayer - * - * @param testConf[in/out] test config - * layerMap[out] layerMap - * parameters[out] parameters of testLayer - * testLayer[out] testLayer - */ -void initTestLayer(TestConfig testConf, - LayerMap* layerMap, - std::vector* parameters, - LayerPtr* testLayer); - -/** - * @brief Test whether the layer's forward calculation is stable by adding - * perturbation to its parameters - * - * @param testConf[in] test config - * weights[in] weights of testLayer - * state[in] state of testLayer - * cost[in] input cost - * callbackCount[in] number of done callback - * maxDiff[in/out] max of all previous diff - * testLayer[in/out] testLayer - * parameters[in/out] parameters of testLayer - */ -void testPerturbParameter(TestConfig testConf, - const MatrixPtr weights, - const LayerStatePtr state, - real cost, - real callbackCount, - real* maxDiff, - LayerPtr testLayer, - std::vector* parameters); - -/** - * @brief Test whether the layer's forward calculation is stable by adding - * perturbation to its input layers - * - * @param testConf[in] test config - * weights[in] weights of testLayer - * state[in] state of testLayer - * cost[in] input cost - * callbackCount[in] number of done callback - * maxDiff[in/out] max of all previous diff - * testLayer[in/out] testLayer - * dataLayers[in/out] dataLayers - */ -void testPerturbInput(TestConfig testConf, - const MatrixPtr weights, - const LayerStatePtr state, - real cost, - real callbackCount, - real* maxDiff, - LayerPtr testLayer, - std::vector dataLayers); - -void testLayerGradKernel(TestConfig testConf, - string testLayerName, - size_t batchSize, - bool trans, - bool useGpu, - bool useWeight = false, - float epsilon = 0.02); - -void testLayerGrad(TestConfig testConf, - string testLayerName, - size_t batchSize, - bool trans, - bool useGpu, - bool useWeight = false, - float epsilon = 0.02); - -void testProjectionGrad(ProjectionConfig conf, - InputType inputType, - size_t parameterSize, - size_t batchSize, - bool useGpu, - bool testState = false, - int biasSize = 0, - bool sharedBias = false); - -void testOperatorGrad(TestConfig& config, - OperatorConfig& operatorConf, - size_t batchSize, - bool useGpu, - bool testState = 
false); - -} // namespace paddle diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp deleted file mode 100644 index d2a9761a4e16832a0722d4375cc11adb42524a8c..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/MKLDNNTester.cpp +++ /dev/null @@ -1,580 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MKLDNNTester.h" -#include "paddle/gserver/layers/MKLDNNBase.h" -#include "paddle/gserver/layers/MKLDNNLayer.h" -#include "paddle/trainer/Trainer.h" - -namespace paddle { - -// init data layer and test layer of both dnn and reference -void MKLDNNTester::reset(const TestConfig& dnn, - const TestConfig& ref, - size_t batchSize) { - const bool trans = false; - const bool useGpu = false; - - // clear - configs_.clear(); - layerNames_.clear(); - dataLayers_.clear(); - datas_.clear(); - layerMaps_.clear(); - parameters_.clear(); - testLayers_.clear(); - - // resize - configs_.resize(NUM); - layerNames_.resize(NUM); - dataLayers_.resize(NUM); - datas_.resize(NUM); - layerMaps_.resize(NUM); - parameters_.resize(NUM); - testLayers_.resize(NUM); - - // reset configs and layer names - configs_[DNN] = dnn; - configs_[REF] = ref; - layerNames_[DNN] = "mkldnn"; // the first is mkldnn layer - layerNames_[REF] = "reference"; // second is reference layer - - // reset others - for (size_t i = 0; i < NUM; ++i) { - configs_[i].layerConfig.set_name(layerNames_[i]); - initDataLayer(configs_[i], - &(dataLayers_[i]), - &(datas_[i]), - &(layerMaps_[i]), - layerNames_[i], - batchSize, - trans, - useGpu); - initTestLayer( - configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i])); - } - refLayer_ = testLayers_[REF]; - dnnLayer_ = testLayers_[DNN]; - EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size()); - EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size()); - setInputImgSize(); - - // for comparison with Paddle reference results, - // need manually add cpu device output for test - MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(dnnLayer_); - if (dnnLayer) { - dnnLayer->addOutputArgument(CPU_DEVICE); - } -} - -void MKLDNNTester::setInputImgSize() { - for (size_t n = 0; n < dataLayers_.size(); ++n) { - for (size_t i = 0; i < dataLayers_[n].size(); ++i) { - // TODO(TJ): fix me when concat and elewise ready - dataLayers_[n][i]->getOutput().setFrameHeight(ih_); - dataLayers_[n][i]->getOutput().setFrameWidth(iw_); - } - } -} - -// init randome parameters of ref, and copy to mkldnn -void MKLDNNTester::randomWgtDatas() { - EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size()); - const bool isBN = refLayer_->getType() == "batch_norm"; - for (size_t i = 0; i < parameters_[REF].size(); ++i) { - const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE); - const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE); - parameters_[REF][i]->randomize(); - if (isBN && i == 2) { - // this param is moving average in batch norm, which must larger than 0 - real 
offset = fabs(refValue->getMin()) + 1.0; - refValue->add(offset); - } - dnnValue->copyFrom(*refValue); - - VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName(); - printVector(dnnValue); - } -} - -// random botdata of ref layer and copy same to mkldnn -void MKLDNNTester::randomBotDatas() { - CHECK_EQ(dataLayers_.size(), NUM); - for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) { - dataLayers_[REF][i]->getOutputValue()->randomizeUniform(); - dataLayers_[DNN][i]->getOutputValue()->copyFrom( - *(dataLayers_[REF][i]->getOutputValue())); - VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i; - printMatrix(dataLayers_[REF][i]->getOutputValue()); - } -} - -void MKLDNNTester::randomTopDiffs() { - refLayer_->getOutputGrad()->randomizeUniform(); - dnnLayer_->getOutput(CPU_DEVICE) - .grad->copyFrom(*(refLayer_->getOutputGrad())); - VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad"; - printMatrix(refLayer_->getOutputGrad()); -} - -void MKLDNNTester::checkForward() { - VLOG(MKLDNN_TESTS) << "Check Forward"; - printTopDatas(); - double delta = - compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue()); - EXPECT_LE(fabs(delta), eps_); -} - -void MKLDNNTester::checkBackwardData() { - VLOG(MKLDNN_TESTS) << "Check Backward Data"; - const bool isBN = refLayer_->getType() == "batch_norm"; - for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) { - const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad(); - const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad(); - VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i; - printMatrix(dnnDiff); - VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i; - printMatrix(refDiff); - - double delta = compareMatrix(refDiff, dnnDiff); - EXPECT_LE(fabs(delta), eps_); - if (isBN) { - // the other two inputs in batch norm are for moving mean and var - // do not have grad to compare - break; - } - } -} - -void MKLDNNTester::checkBackwardWgts() { - VLOG(MKLDNN_TESTS) << "Check Backward Weight"; - CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size()); - vector dnnWgts; // used to temply save mkldnn weights - saveWgt(parameters_[DNN], dnnWgts); - - MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(dnnLayer_); - if (dnnLayer) { - dnnLayer->convertWeightsToPaddle(); - } - for (size_t i = 0; i < parameters_[DNN].size(); ++i) { - const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE); - const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE); - VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value" - << parameters_[DNN][i]->getName(); - printVector(dnn); - VLOG(MKLDNN_ALL) << "Reference Result: weight value " - << parameters_[REF][i]->getName(); - printVector(ref); - - double delta = compareVector(ref, dnn); - EXPECT_LE(fabs(delta), eps_); - } - - VLOG(MKLDNN_ALL) << "Restore dnn weights before comapre"; - restoreWgt(dnnWgts, parameters_[DNN]); -} - -void MKLDNNTester::saveWgt(const vector& from, - vector& to) { - const bool useGpu = false; - to.resize(from.size()); - for (size_t i = 0; i < to.size(); ++i) { - const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE); - to[i] = Vector::create(wgt->getSize(), useGpu); - to[i]->copyFrom(*wgt); - } -} - -void MKLDNNTester::restoreWgt(const vector& from, - vector& to) { - CHECK_EQ(from.size(), to.size()); - for (size_t i = 0; i < from.size(); ++i) { - const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE); - wgt->copyFrom(*from[i]); - } -} - -// clear parameters grad -void MKLDNNTester::clearWgtDiffs(size_t id) { - CHECK_LE(id, 
parameters_.size()); - for (size_t n = 0; n < parameters_.size(); ++n) { - if (id == n || id == parameters_.size()) { - for (size_t i = 0; i < parameters_[n].size(); ++i) { - const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT); - if (grad) { - grad->zeroMem(); - } - } - } - } -} - -void MKLDNNTester::clearBotDiffs(size_t id) { - CHECK_LE(id, dataLayers_.size()); - for (size_t n = 0; n < dataLayers_.size(); ++n) { - if (id == n || id == dataLayers_.size()) { - // clear inputs layers of this specific layer - for (size_t i = 0; i < dataLayers_[n].size(); ++i) { - dataLayers_[n][i]->getOutputGrad()->zeroMem(); - } - } - } -} - -void MKLDNNTester::clearTopDatas(size_t id) { - CHECK_LE(id, testLayers_.size()); - for (size_t i = 0; i < testLayers_.size(); ++i) { - if (id == i || id == testLayers_.size()) { - testLayers_[i]->getOutputValue()->zeroMem(); - } - } -} - -void MKLDNNTester::printTopDatas() { - if (!log_) { - return; - } - - for (int n = 0; n < NUM; ++n) { - VLOG(MKLDNN_ALL) << testLayers_[n]->getType() - << " Forward Result: OutputValue"; - printMatrix(testLayers_[n]->getOutputValue()); - } -} - -void MKLDNNTester::printMatrix(const MatrixPtr& m) { - if (!log_) { - return; - } - - std::ostringstream ostr; - m->print(ostr); - VLOG(MKLDNN_ALL) << std::endl << ostr.str(); -} - -void MKLDNNTester::printVector(const VectorPtr& v) { - if (!log_) { - return; - } - - std::ostringstream ostr; - v->print(ostr, v->getSize()); - VLOG(MKLDNN_ALL) << std::endl << ostr.str(); -} - -double MKLDNNTester::getDelta(const real* refer, - const real* value, - size_t len, - const float failRate, - const float thres) { - double delta = 0, sum = 0; - int failCnt = 0; - const double eps = 1e-5; - double maxRatio = 0; - for (size_t i = 0; i < len; ++i) { - double ref = fabs(refer[i]); - double val = fabs(value[i]); - double diff = fabs(refer[i] - value[i]); - delta += diff; - sum += ref; - if (ref < eps && val < eps) { // both values are very small - continue; - } - double ratio = diff / ref; - if (ratio > thres) { - maxRatio = std::max(maxRatio, ratio); - failCnt++; - } - } - EXPECT_FALSE(std::isinf(sum)); - EXPECT_FALSE(std::isnan(sum)); - EXPECT_FALSE(std::isnan(delta)); - VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len - << ", delta: " << delta / sum << ", failCnt:" << failCnt; - double res = sum > eps ? delta / sum : eps; - return (failCnt / (float)len) > failRate ? 
maxRatio : res;
-}
-
-double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) {
-  CHECK_EQ(m1->getElementCnt(), m2->getElementCnt());
-  return getDelta(m1->getData(), m2->getData(), m1->getElementCnt());
-}
-
-double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) {
-  CHECK_EQ(v1->getSize(), v2->getSize());
-  return getDelta(v1->getData(), v2->getData(), v1->getSize());
-}
-
-void MKLDNNTester::runOnce() {
-  // test forward
-  randomBotDatas();
-  dnnLayer_->forward(passType_);
-  refLayer_->forward(passType_);
-  checkForward();
-
-  if (passType_ == PASS_TEST) {
-    return;
-  }
-
-  // test backward
-  // simple updater
-  UpdateCallback updateCallback = [](Parameter* para) {
-    auto& grad = para->getBuf(PARAMETER_GRADIENT);
-    auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-2;
-    value->add(*grad, lr);
-    grad->zeroMem();
-  };
-  randomTopDiffs();
-  dnnLayer_->backward(updateCallback);
-  refLayer_->backward(updateCallback);
-  checkBackwardData();
-  checkBackwardWgts();
-
-  // clear buffers
-  // ref code will add to the diff, dnn code will write to it
-  // and clearTopDatas(REF) should be covered by ref layers
-  clearBotDiffs(REF);
-  clearWgtDiffs(REF);
-  // it is necessary to clear bottom diffs when only activation is dnn type
-  if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) {
-    clearBotDiffs(DNN);
-  }
-}
-
-void MKLDNNTester::run(const TestConfig& dnn,
-                       const TestConfig& ref,
-                       size_t batchSize,
-                       size_t inputImgH,
-                       size_t inputImgW,
-                       PassType passType,
-                       bool printDetails,
-                       size_t iter,
-                       float epsilon) {
-  CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
-        dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
-      << "should be MKLDNN layer or MKLDNN activation";
-  if (dnn.layerConfig.type() == ref.layerConfig.type()) {
-    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
-                       << dnn.layerConfig.active_type() << " vs "
-                       << ref.layerConfig.active_type();
-  } else {
-    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
-                       << dnn.layerConfig.type() << " vs "
-                       << ref.layerConfig.type();
-  }
-
-  ih_ = inputImgH;
-  iw_ = inputImgW;
-  passType_ = passType;
-  log_ = printDetails;
-  iter_ = iter;
-  eps_ = epsilon;
-
-  // First, test mkldnn init from PARAM_FORMAT_ORIGINAL weight
-  reset(dnn, ref, batchSize);
-  randomWgtDatas();
-  clearWgtDiffs();
-  clearBotDiffs();
-  for (size_t i = 0; i < iter_; ++i) {
-    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
-    runOnce();
-  }
-
-  if (parameters_[DNN].empty()) {
-    // has no parameters
-    return;
-  }
-
-  // After running some iterations, the mkldnn weight has been stored in
-  // dnnLayer, and we can also get the mkldnn weight parameter header format.
-  // Weight parameter should always be index 0 (and bias index 1).
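The tolerance logic of getDelta above reduces to a normalized L1 difference plus an element-wise failure ratio. A condensed standalone version of the same metric (the failRate/thres defaults here are my own choice, illustrative only):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Condensed form of the getDelta comparison above: return the L1 difference
// normalized by the reference magnitude, unless too many individual elements
// exceed the relative threshold, in which case return the worst ratio.
double relDiff(const std::vector<float>& ref, const std::vector<float>& val,
               float failRate = 1e-3f, float thres = 0.1f) {
  const double eps = 1e-5;
  double delta = 0, sum = 0, maxRatio = 0;
  size_t failCnt = 0;
  for (size_t i = 0; i < ref.size(); ++i) {
    double r = std::fabs(ref[i]);
    double diff = std::fabs(ref[i] - val[i]);
    delta += diff;
    sum += r;
    if (r < eps && std::fabs(val[i]) < eps) continue;  // both negligible
    double ratio = diff / r;
    if (ratio > thres) {
      maxRatio = std::max(maxRatio, ratio);
      ++failCnt;
    }
  }
  double res = sum > eps ? delta / sum : eps;
  return (failCnt / (double)ref.size()) > failRate ? maxRatio : res;
}

int main() {
  std::vector<float> a = {1.f, 2.f, 3.f}, b = {1.001f, 2.0f, 2.999f};
  std::printf("%.6f\n", relDiff(a, b));  // small value: the arrays agree
}
```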
- // TODO(TJ): should also consider mean and var format when batchnorm ready - int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat(); - int refWgtFmt = parameters_[REF][0]->getHeaderFormat(); - if (dnnWgtFmt == refWgtFmt) { - // weight formats are equal, so no further check is needed - return; - } - - // otherwise save the weights and restart again - vector<VectorPtr> dnnWgts, refWgts; - CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size()); - saveWgt(parameters_[DNN], dnnWgts); - saveWgt(parameters_[REF], refWgts); - - // restart again with the dnn weight format - reset(dnn, ref, batchSize); - // TODO(TJ): should also consider mean and var format when batchnorm ready - parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt); - - // restore wgt - restoreWgt(dnnWgts, parameters_[DNN]); - restoreWgt(refWgts, parameters_[REF]); - clearWgtDiffs(); - clearBotDiffs(); - - for (size_t i = 0; i < iter_; ++i) { - VLOG(MKLDNN_TESTS) << "Check Iteration " << i; - runOnce(); - } -} - -void MKLDNNTester::initArgument(DataIn& data, - const std::string& configPath, - const size_t iter) { - TrainerConfigHelper config(configPath); - size_t batchSize = config.getOptConfig().batch_size(); - data.inArgs.resize(iter); - data.outGrads.resize(iter); - data.paraValues.clear(); - for (const auto& layer_name : config.getModelConfig().input_layer_names()) { - auto layer_config = std::find_if(config.getModelConfig().layers().begin(), - config.getModelConfig().layers().end(), - [=](const LayerConfig& layer_config) { - return layer_config.name() == layer_name; - }); - CHECK(layer_config != config.getModelConfig().layers().end()); - - size_t layerSize = layer_config->size(); - for (size_t i = 0; i < iter; ++i) { - Argument arg; - arg.value = Matrix::create(batchSize, layerSize, false, false); - arg.grad = Matrix::create(batchSize, layerSize, false, false); - arg.value->randomizeUniform(); - arg.value->add(-0.5); - arg.value->sigmoid(*arg.value); - arg.grad->zeroMem(); - arg.ids = VectorT<int>::create(batchSize, false); - arg.ids->rand(layerSize); - generateSequenceStartPositions(batchSize, arg.sequenceStartPositions); - data.inArgs[i].push_back(arg); - } - } - - for (const auto& layer_name : config.getModelConfig().output_layer_names()) { - auto layer_config = std::find_if(config.getModelConfig().layers().begin(), - config.getModelConfig().layers().end(), - [=](const LayerConfig& layer_config) { - return layer_config.name() == layer_name; - }); - CHECK(layer_config != config.getModelConfig().layers().end()); - - size_t layerSize = layer_config->size(); - for (size_t i = 0; i < iter; ++i) { - MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false); - grad->randomizeUniform(); - data.outGrads[i].push_back(grad); - } - } - - for (const auto& para_config : config.getModelConfig().parameters()) { - VectorPtr value = Vector::create(para_config.size(), false); - value->randnorm(0, 2); - data.paraValues.push_back(value); - } -} - -void MKLDNNTester::getOutResult(const std::string& configPath, - DataIn& in, - DataOut& out, - bool use_mkldnn, - size_t iter) { - FLAGS_use_gpu = false; - FLAGS_use_mkldnn = use_mkldnn; - *ThreadLocalRand::getSeed() = 1; - srand(1); - - Trainer trainer; - auto config = std::make_shared<TrainerConfigHelper>(configPath); - trainer.init(config, false); - auto gradientMachine = trainer.getGradientMachine(); - std::vector<ParameterPtr> parameters = gradientMachine->getParameters(); - for (size_t i = 0; i < in.paraValues.size(); i++) { - parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]); - } - UpdateCallback simpleUpdate = [](Parameter* para)
{ - auto& grad = para->getBuf(PARAMETER_GRADIENT); - auto& value = para->getBuf(PARAMETER_VALUE); - real lr = 1e-2; - value->add(*grad, lr); - grad->zeroMem(); - }; - - vector<Argument> outArgs; - gradientMachine->start(); - out.outValues.clear(); - out.paraValues.clear(); - for (size_t i = 0; i < iter; ++i) { - VLOG(MKLDNN_TESTS) << "running iteration " << i; - gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN); - // save the forward result - for (size_t k = 0; k < outArgs.size(); k++) { - const MatrixPtr& src = outArgs[k].value; - MatrixPtr dst = - Matrix::create(src->getHeight(), src->getWidth(), false, false); - if (typeid(*src) == typeid(MKLDNNMatrix)) { - MKLDNNMatrixPtr dnnSrc = std::dynamic_pointer_cast<MKLDNNMatrix>(src); - dnnSrc->copyTo(*dst); - } else { - dst->copyFrom(*src); - } - out.outValues.push_back(dst); - } - - // random backward input - for (size_t k = 0; k < outArgs.size(); k++) { - outArgs[k].grad->copyFrom(*in.outGrads[i][k]); - } - gradientMachine->backward(simpleUpdate); - } - gradientMachine->finish(); - - // save the parameter values - for (size_t i = 0; i < in.paraValues.size(); i++) { - VectorPtr val = Vector::create( - parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false); - val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE)); - out.paraValues.push_back(val); - } -} - -void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) { - CHECK_EQ(ref.outValues.size(), dnn.outValues.size()); - CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size()); - for (size_t i = 0; i < ref.outValues.size(); i++) { - VLOG(MKLDNN_TESTS) << "compare value index: " << i; - EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps); - } - for (size_t i = 0; i < ref.paraValues.size(); i++) { - VLOG(MKLDNN_TESTS) << "compare param index: " << i; - EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps); - } -} - -void MKLDNNTester::runNetTest(const std::string& configPath, - size_t iter, - float eps) { - DataIn in; - initArgument(in, configPath, iter); - DataOut outCpu, outDnn; - VLOG(MKLDNN_TESTS) << "running cpu network"; - getOutResult(configPath, in, outCpu, false, iter); - VLOG(MKLDNN_TESTS) << "running mkldnn network"; - getOutResult(configPath, in, outDnn, true, iter); - - compareResult(outCpu, outDnn, eps); -} - -} // namespace paddle diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h deleted file mode 100644 index 41ac46b70ab08d4071f4e6abfca94667268015d7..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/MKLDNNTester.h +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once - -#include <string> -#include <vector> -#include "LayerGradUtil.h" -#include "paddle/gserver/layers/MKLDNNBase.h" -#include "paddle/gserver/layers/MKLDNNLayer.h" - -namespace paddle { - -/** - * @brief Test the functionality of MKLDNN layers and MKLDNN activations - * against the reference (original PaddlePaddle) implementations - */ -class MKLDNNTester { - enum { - DNN = 0, // MKLDNN layer - REF = 1, // Reference layer - NUM = 2, // Total number - }; - - struct DataIn { - std::vector<std::vector<Argument>> inArgs; - std::vector<std::vector<MatrixPtr>> outGrads; - std::vector<VectorPtr> paraValues; - }; - - struct DataOut { - std::vector<MatrixPtr> outValues; - std::vector<VectorPtr> paraValues; - }; - - protected: - std::vector<TestConfig> configs_; - vector<string> layerNames_; - vector<vector<DataLayerPtr>> dataLayers_; - vector<vector<Argument>> datas_; - vector<LayerMap> layerMaps_; - vector<vector<ParameterPtr>> parameters_; - vector<LayerPtr> testLayers_; - LayerPtr refLayer_, dnnLayer_; - - /// run some iterations; every result should pass - size_t iter_; - /// whether to print out the details - bool log_; - /// epsilon - float eps_; - /// input image size, default 1 - size_t ih_, iw_; - /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass) - PassType passType_; - - public: - explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) { - iter_ = iter; - eps_ = epsilon; - log_ = false; - passType_ = PASS_TRAIN; - } - - ~MKLDNNTester() {} - - public: - void run(const TestConfig& dnn, - const TestConfig& ref, - size_t batchSize, - size_t inputImgH = 1, - size_t inputImgW = 1, - PassType passType = PASS_TRAIN, - bool printDetails = false, - size_t iter = 3, - float epsilon = 1e-4); - static void runNetTest(const std::string& configPath, - size_t iter = 2, - float eps = 1e-4); - static void initArgument(DataIn& data, - const std::string& configPath, - size_t iter = 2); - static void getOutResult(const std::string& configPath, - DataIn& in, - DataOut& out, - bool use_mkldnn, - size_t iter = 2); - - private: - void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize); - void setInputImgSize(); - void runOnce(); - - void randomWgtDatas(); - void randomBotDatas(); - void randomTopDiffs(); - - void checkForward(); - void checkBackwardData(); - void checkBackwardWgts(); - - // clear a specific layer; clear all when id equals NUM - void clearWgtDiffs(size_t id = NUM); - void clearBotDiffs(size_t id = NUM); - void clearTopDatas(size_t id = NUM); - - void printTopDatas(); - void printMatrix(const MatrixPtr& m); - void printVector(const VectorPtr& v); - - void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to); - void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to); - - static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2); - static double compareVector(const VectorPtr& v1, const VectorPtr& v2); - static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4); - - /** - * Get the delta percent: - * if too many (> failRate) elements are wrong, i.e. abs(val-ref)/abs(ref) > thres, - * return max(diff/ref); - * otherwise return sum(abs(diff)) / sum(abs(ref)). - * The return value should be smaller than eps when passing.
- */ - static double getDelta(const real* refer, - const real* value, - size_t len, - const float failRate = 1e-3, - const float thres = 0.1); -}; - -} // namespace paddle diff --git a/paddle/gserver/tests/Sequence/train.list b/paddle/gserver/tests/Sequence/train.list deleted file mode 100644 index be27acb3a5411d8fe65797079a9a5977c1f0f90a..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/Sequence/train.list +++ /dev/null @@ -1 +0,0 @@ -gserver/tests/Sequence/tour_train_wdseg diff --git a/paddle/gserver/tests/Sequence/train.list.nest b/paddle/gserver/tests/Sequence/train.list.nest deleted file mode 100644 index 7683ebc68efbb07ce01d8faab14574109df99af9..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/Sequence/train.list.nest +++ /dev/null @@ -1 +0,0 @@ -gserver/tests/Sequence/tour_train_wdseg.nest diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf deleted file mode 100644 index 50f2d89d0271b2eaa460e57636eb09b6d6aeda18..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_layer_group.conf +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -######################## data source ################################ -dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' -dict_file = dict() -for line_count, line in enumerate(open(dict_path, "r")): - dict_file[line.strip()] = line_count - -define_py_data_sources2( - train_list='gserver/tests/Sequence/train.list', - test_list=None, - module='sequenceGen', - obj='process', - args={"dict_file": dict_file}) - -settings(batch_size=5) -######################## network configure ################################ -dict_dim = len(open(dict_path, 'r').readlines()) -word_dim = 128 -hidden_dim = 256 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -# (lstm_input + lstm) is equal to lstmemory -with mixed_layer(size=hidden_dim * 4) as lstm_input: - lstm_input += full_matrix_projection(input=emb) - -lstm = lstmemory_group( - input=lstm_input, - size=hidden_dim, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation()) - -lstm_last = last_seq(input=lstm) - -with mixed_layer( - size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: - output += full_matrix_projection(input=lstm_last) - -outputs( - classification_cost( - input=output, label=data_layer( - name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_lstm.conf b/paddle/gserver/tests/sequence_lstm.conf deleted file mode 100644 index f49a827f22edce056eaf9903e99b732cab7f3784..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_lstm.conf +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -######################## data source ################################ -dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' -dict_file = dict() -for line_count, line in enumerate(open(dict_path, "r")): - dict_file[line.strip()] = line_count - -define_py_data_sources2( - train_list='gserver/tests/Sequence/train.list', - test_list=None, - module='sequenceGen', - obj='process', - args={"dict_file": dict_file}) - -settings(batch_size=5) -######################## network configure ################################ -dict_dim = len(open(dict_path, 'r').readlines()) -word_dim = 128 -hidden_dim = 256 -label_dim = 3 -sparse_update = get_config_arg("sparse_update", bool, False) - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer( - input=data, - size=word_dim, - param_attr=ParamAttr(sparse_update=sparse_update)) - -with mixed_layer(size=hidden_dim * 4) as lstm_input: - lstm_input += full_matrix_projection(input=emb) - -lstm = lstmemory( - input=lstm_input, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation()) - -lstm_last = last_seq(input=lstm) - -with mixed_layer( - size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: - output += full_matrix_projection(input=lstm_last) - -outputs( - classification_cost( - input=output, label=data_layer( - name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf deleted file mode 100644 index 71ef53d08a2cea070806afb2c65ef15c4dd28f31..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_nest_layer_group.conf +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
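The `sparse_update` argument read by sequence_lstm.conf above is the hook that the sparse/dense comparison test later in this diff (test_CompareSparse.cpp) toggles through `FLAGS_config_args`. A minimal sketch of that round trip, using only calls visible in this diff:

```python
# Sketch only: how test_CompareSparse.cpp drives this config. The C++ side
# sets FLAGS_config_args to "sparse_update=1" or "sparse_update=0"; the
# config reads it back here, defaulting to False when the argument is absent.
sparse_update = get_config_arg("sparse_update", bool, False)

emb = embedding_layer(
    input=data,
    size=word_dim,
    # enables sparse gradient updates for the embedding table when set
    param_attr=ParamAttr(sparse_update=sparse_update))
```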
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' -dict_file = dict() -for line_count, line in enumerate(open(dict_path, "r")): - dict_file[line.strip()] = line_count - -define_py_data_sources2( - train_list='gserver/tests/Sequence/train.list.nest', - test_list=None, - module='sequenceGen', - obj='process2', - args={"dict_file": dict_file}) - -settings(batch_size=2) -######################## network configure ################################ -dict_dim = len(open(dict_path, 'r').readlines()) -word_dim = 128 -hidden_dim = 256 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb_group = embedding_layer(input=data, size=word_dim) - - -# (lstm_input + lstm) is equal to lstmemory -def lstm_group(lstm_group_input): - with mixed_layer(size=hidden_dim * 4) as group_input: - group_input += full_matrix_projection(input=lstm_group_input) - - lstm_output = lstmemory_group( - input=group_input, - name="lstm_group", - size=hidden_dim, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation()) - return lstm_output - - -lstm_nest_group = recurrent_group( - input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group") -# hasSubseq ->(seqlastins) seq -lstm_last = last_seq( - input=lstm_nest_group, agg_level=AggregateLevel.TO_SEQUENCE) - -# seq ->(expand) hasSubseq -lstm_expand = expand_layer( - input=lstm_last, - expand_as=emb_group, - expand_level=ExpandLevel.FROM_SEQUENCE) - -# hasSubseq ->(average) seq -lstm_average = pooling_layer( - input=lstm_expand, - pooling_type=AvgPooling(), - agg_level=AggregateLevel.TO_SEQUENCE) - -with mixed_layer( - size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: - output += full_matrix_projection(input=lstm_average) - -outputs( - classification_cost( - input=output, label=data_layer( - name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf deleted file mode 100644 index 2873a599669b4281a53cd71e8bb56f0d18c26b5a..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_nest_rnn.conf +++ /dev/null @@ -1,74 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
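sequence_layer_group.conf and sequence_nest_layer_group.conf above both lean on the "(lstm_input + lstm) is equal to lstmemory" comment: once the input has been projected to `hidden_dim * 4`, the fused `lstmemory` layer and the step-by-step `lstmemory_group` compute the same recurrence, and only the group form can be nested inside a `recurrent_group`. A condensed sketch of the equivalence, assuming the helpers these configs already import:

```python
# Sketch only: two equivalent LSTM formulations used by these test configs.
with mixed_layer(size=hidden_dim * 4) as lstm_input:
    lstm_input += full_matrix_projection(input=emb)

# Fused form: one layer consumes the whole projected sequence.
lstm_fused = lstmemory(
    input=lstm_input,
    act=TanhActivation(),
    gate_act=SigmoidActivation(),
    state_act=TanhActivation())

# Unrolled form: the same recurrence expressed step by step, which is what
# allows it to run inside a (possibly nested) recurrent_group.
lstm_unrolled = lstmemory_group(
    input=lstm_input,
    size=hidden_dim,
    act=TanhActivation(),
    gate_act=SigmoidActivation(),
    state_act=TanhActivation())
```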
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_subseq') - - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -# This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn.conf - -def outer_step(x): - outer_mem = memory(name="outer_rnn_state", size=hidden_dim) - def inner_step(y): - inner_mem = memory(name="inner_rnn_state", - size=hidden_dim, - boot_layer=outer_mem) - out = fc_layer(input=[y, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="inner_rnn_state") - return out - - inner_rnn_output = recurrent_group( - step=inner_step, - name="inner", - input=x) - last = last_seq(input=inner_rnn_output, name="outer_rnn_state") - - # "return last" won't work, because recurrent_group only supports returning a - # sequence of the same type as its input sequence. - return inner_rnn_output - -out = recurrent_group( - name="outer", - step=outer_step, - input=SubsequenceInput(emb)) - -rep = last_seq(input=out) -prob = fc_layer(size=label_dim, - input=rep, - act=SoftmaxActivation(), - bias_attr=True) - -outputs(classification_cost(input=prob, - label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf deleted file mode 100644 index afdacfffd7aecfe2f4762f04a987126381bcea34..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf +++ /dev/null @@ -1,76 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
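The equivalence asserted in sequence_nest_rnn.conf above (the nested RNN vs. the flat RNN in sequence_rnn.conf) hinges entirely on how the inner memory is booted; a condensed sketch of just that wiring:

```python
# Sketch only: the state hand-off that makes the two-level RNN equivalent to
# a flat RNN over the concatenated subsequences.
outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
inner_mem = memory(
    name="inner_rnn_state",
    size=hidden_dim,
    # The inner RNN starts each subsequence from the outer state, i.e. from
    # the last hidden state of the previous subsequence.
    boot_layer=outer_mem)
```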
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_subseq') - - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -# This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_multi_input.conf - -def outer_step(wid, x): - outer_mem = memory(name="outer_rnn_state", size=hidden_dim) - def inner_step(y, wid): - z = embedding_layer(input=wid, size=word_dim) - inner_mem = memory(name="inner_rnn_state", - size=hidden_dim, - boot_layer=outer_mem) - out = fc_layer(input=[y, z, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="inner_rnn_state") - return out - - inner_rnn_output = recurrent_group( - step=inner_step, - name="inner", - input=[x, wid]) - last = last_seq(input=inner_rnn_output, name="outer_rnn_state") - - # "return last" should also work, but currently RecurrentGradientMachine - # does not handle it and will report the error: "In hierachical RNN, all out - # links should be from sequences now." - return inner_rnn_output - -out = recurrent_group( - name="outer", - step=outer_step, - input=[SubsequenceInput(data), SubsequenceInput(emb)]) - -rep = last_seq(input=out) -prob = fc_layer(size=label_dim, - input=rep, - act=SoftmaxActivation(), - bias_attr=True) - -outputs(classification_cost(input=prob, - label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py deleted file mode 100644 index 569d3c094b6f5517dad0f1e04f98de12aaef9633..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License.
-from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2( - train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_unequalength_subseq') - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 2 - -speaker1 = data_layer(name="word1", size=dict_dim) -speaker2 = data_layer(name="word2", size=dict_dim) - -emb1 = embedding_layer(input=speaker1, size=word_dim) -emb2 = embedding_layer(input=speaker2, size=word_dim) - - -# This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_multi_unequalength_inputs.conf -def outer_step(x1, x2): - index = [0] - - def inner_step(ipt): - index[0] += 1 - i = index[0] - outer_mem = memory(name="outer_rnn_state_%d" % i, size=hidden_dim) - - def inner_step_impl(y): - inner_mem = memory( - name="inner_rnn_state_" + y.name, - size=hidden_dim, - boot_layer=outer_mem) - out = fc_layer( - input=[y, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name='inner_rnn_state_' + y.name) - return out - - encoder = recurrent_group( - step=inner_step_impl, name='inner_%d' % i, input=ipt) - last = last_seq(name="outer_rnn_state_%d" % i, input=encoder) - return encoder, last - - encoder1, sentence_last_state1 = inner_step(ipt=x1) - encoder2, sentence_last_state2 = inner_step(ipt=x2) - - encoder1_expand = expand_layer( - input=sentence_last_state1, expand_as=encoder2) - - return [encoder1_expand, encoder2] - - -encoder1_rep, encoder2_rep = recurrent_group( - name="outer", - step=outer_step, - input=[SubsequenceInput(emb1), SubsequenceInput(emb2)], - targetInlink=emb2) - -encoder1_last = last_seq(input=encoder1_rep) -encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep) -context = mixed_layer( - input=[ - identity_projection(encoder1_expandlast), - identity_projection(encoder2_rep) - ], - size=hidden_dim) - -rep = last_seq(input=context) -prob = fc_layer( - size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) - -outputs( - classification_cost( - input=prob, label=data_layer( - name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py deleted file mode 100644 index b88c09084e1bc167a177b59566e9794ac4d616c7..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_recurrent.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
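When a `recurrent_group` consumes two subsequence inputs of unequal lengths, as sequence_nest_rnn_multi_unequalength_inputs.py above does, `targetInlink` selects which input's sequence layout the group's outputs follow, and the shorter branch is then broadcast with `expand_layer` so the two outputs line up step for step. A condensed sketch using that config's own names:

```python
# Sketch only: aligning two unequal-length branches inside one group.
encoder1_rep, encoder2_rep = recurrent_group(
    name="outer",
    step=outer_step,
    input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
    targetInlink=emb2)  # outputs follow emb2's sequence layout

# Broadcast branch 1's per-sequence summary so it aligns with branch 2.
encoder1_expandlast = expand_layer(
    input=last_seq(input=encoder1_rep),
    expand_as=encoder2_rep)
```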
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' -dict_file = dict() -for line_count, line in enumerate(open(dict_path, "r")): - dict_file[line.strip()] = line_count - -define_py_data_sources2( - train_list='gserver/tests/Sequence/train.list', - test_list=None, - module='sequenceGen', - obj='process', - args={"dict_file": dict_file}) - -settings(batch_size=5) -######################## network configure ################################ -dict_dim = len(open(dict_path, 'r').readlines()) -word_dim = 128 -hidden_dim = 128 -label_dim = 3 - -# This config is designed to be equivalent with sequence_recurrent_group.py - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer( - input=data, size=word_dim, param_attr=ParamAttr(name="emb")) - -recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation()) - -recurrent_last = last_seq(input=recurrent) - -with mixed_layer( - size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: - output += full_matrix_projection(input=recurrent_last) - -outputs( - classification_cost( - input=output, label=data_layer( - name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py deleted file mode 100644 index 0daf746700231d302550004b1c10729e36807b8b..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_recurrent_group.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from paddle.trainer_config_helpers import * - -######################## data source ################################ -dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' -dict_file = dict() -for line_count, line in enumerate(open(dict_path, "r")): - dict_file[line.strip()] = line_count - -define_py_data_sources2( - train_list='gserver/tests/Sequence/train.list', - test_list=None, - module='sequenceGen', - obj='process', - args={"dict_file": dict_file}) - -settings(batch_size=5) -######################## network configure ################################ -dict_dim = len(open(dict_path, 'r').readlines()) -word_dim = 128 -hidden_dim = 128 -label_dim = 3 - -# This config is designed to be equivalent with sequence_recurrent.py - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer( - input=data, size=word_dim, param_attr=ParamAttr(name="emb")) - - -def step(y): - mem = memory(name="rnn_state", size=hidden_dim) - with mixed_layer( - name="rnn_state", - size=hidden_dim, - bias_attr=False, - act=SoftmaxActivation()) as out: - out += identity_projection(input=y) - out += full_matrix_projection( - input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__")) - return out - - -recurrent = recurrent_group(name="rnn", step=step, input=emb) - -recurrent_last = last_seq(input=recurrent) - -with mixed_layer( - size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: - output += full_matrix_projection(input=recurrent_last) - -outputs( - classification_cost( - input=output, label=data_layer( - name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_rnn.conf b/paddle/gserver/tests/sequence_rnn.conf deleted file mode 100644 index 1084edfe708c3348d40b67e270f64d8cda3cee0f..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_rnn.conf +++ /dev/null @@ -1,57 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
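sequence_recurrent.py and sequence_recurrent_group.py above are kept numerically identical through name-based parameter sharing: layers that reference a `ParamAttr` with the same name use the same underlying weight matrix, which is why the group version deliberately reuses the auto-generated `___recurrent_layer_0__` parameter name (and both reuse `emb`). A minimal sketch with a hypothetical parameter name:

```python
# Sketch only: two layers share one weight matrix when their ParamAttr names
# match ("shared_fc_w" is a hypothetical name, not from this diff).
shared_w = ParamAttr(name="shared_fc_w")

fc_a = fc_layer(input=x, size=hidden_dim, param_attr=shared_w)
fc_b = fc_layer(input=y, size=hidden_dim, param_attr=shared_w)  # same weights as fc_a
```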
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_seq') - - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -def step(y): - mem = memory(name="rnn_state", size=hidden_dim) - out = fc_layer(input=[y, mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="rnn_state") - return out - -out = recurrent_group( - name="rnn", - step=step, - input=emb) - -rep = last_seq(input=out) -prob = fc_layer(size=label_dim, - input=rep, - act=SoftmaxActivation(), - bias_attr=True) - -outputs(classification_cost(input=prob, - label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py deleted file mode 100644 index 41a581e0ccd59588d1bcce9345056bea9d80b73d..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
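The step function in sequence_rnn.conf above relies on a naming convention that is easy to miss: a `memory` named N reads, at step t, the output of the layer named N from step t-1, so giving the `fc_layer` the same name as the memory is what closes the recurrent loop. An annotated sketch of that step:

```python
# Sketch only: the name binding between a memory and its source layer.
def step(y):
    mem = memory(name="rnn_state", size=hidden_dim)  # reads h_{t-1}
    out = fc_layer(input=[y, mem],
                   size=hidden_dim,
                   act=TanhActivation(),
                   bias_attr=True,
                   name="rnn_state")                 # written back as h_t
    return out
```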
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2( - train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_mixed') - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 2 -hidden_dim = 2 -label_dim = 2 - -data1 = data_layer(name="word1", size=dict_dim) -data2 = data_layer(name="word2", size=dict_dim) -label = data_layer(name="label", size=label_dim) - -encoding = embedding_layer(input=data2, size=word_dim) - -subseq = embedding_layer(input=data1, size=word_dim) -seq = embedding_layer(input=data2, size=word_dim) -nonseq = embedding_layer(input=label, size=word_dim) - - -# This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_mixed_inputs.conf -def outer_step(subseq, seq, nonseq, encoding): - outer_mem = memory(name="outer_rnn_state", size=hidden_dim) - - def inner_step(subseq, seq, nonseq): - inner_mem = memory( - name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem) - - out = fc_layer( - input=[subseq, seq, nonseq, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name='inner_rnn_state') - return out - - decoder = recurrent_group( - step=inner_step, name='inner', input=[subseq, seq, nonseq]) - last = last_seq(name="outer_rnn_state", input=decoder) - context = simple_attention( - encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last) - return context - - -out = recurrent_group( - name="outer", - step=outer_step, - input=[ - subseq, expand_layer( - seq, expand_as=subseq, - expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer( - nonseq, - expand_as=subseq, - expand_level=ExpandLevel.FROM_NO_SEQUENCE), - StaticInput(encoding) - ]) - -rep = last_seq(input=out) -prob = fc_layer( - size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) - -outputs(classification_cost(input=prob, label=label)) diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py deleted file mode 100644 index ae89d8e2bb6f672eaf697ae4d24895b89f76544f..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2( - train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_mixed') - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 2 -hidden_dim = 2 -label_dim = 2 - -data1 = data_layer(name="word1", size=dict_dim) -data2 = data_layer(name="word2", size=dict_dim) -label = data_layer(name="label", size=label_dim) - -encoding = embedding_layer(input=data2, size=word_dim) - - -# This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_matched_inputs.conf -def outer_step(subseq, seq, nonseq, encoding): - outer_mem = memory(name="outer_rnn_state", size=hidden_dim) - - def inner_step(data1, data2, label): - inner_mem = memory( - name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem) - - subseq = embedding_layer(input=data1, size=word_dim) - seq = embedding_layer(input=data2, size=word_dim) - nonseq = embedding_layer(input=label, size=word_dim) - - print_layer(input=[data1, seq, label, inner_mem]) - out = fc_layer( - input=[subseq, seq, nonseq, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name='inner_rnn_state') - return out - - decoder = recurrent_group( - step=inner_step, name='inner', - input=[subseq, StaticInput(seq), nonseq]) - last = last_seq(name="outer_rnn_state", input=decoder) - context = simple_attention( - encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last) - return context - - -out = recurrent_group( - name="outer", - step=outer_step, - input=[data1, data2, StaticInput(label), StaticInput(encoding)]) - -rep = last_seq(input=out) -prob = fc_layer( - size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) - -outputs(classification_cost(input=prob, label=label)) diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf deleted file mode 100644 index 9fae974f3079c49ad03d6ba34e30190f325414e8..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_rnn_multi_input.conf +++ /dev/null @@ -1,58 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_seq') - - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -def step(y, wid): - z = embedding_layer(input=wid, size=word_dim) - mem = memory(name="rnn_state", size=hidden_dim) - out = fc_layer(input=[y, z, mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="rnn_state") - return out - -out = recurrent_group( - name="rnn", - step=step, - input=[emb, data]) - -rep = last_seq(input=out) -prob = fc_layer(size=label_dim, - input=rep, - act=SoftmaxActivation(), - bias_attr=True) - -outputs(classification_cost(input=prob, - label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py deleted file mode 100644 index 6473fb3f3eddc803282911a156c489e4ba39aded..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2( - train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_unequalength_seq') - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 2 - -speaker1 = data_layer(name="word1", size=dict_dim) -speaker2 = data_layer(name="word2", size=dict_dim) - -emb1 = embedding_layer(input=speaker1, size=word_dim) -emb2 = embedding_layer(input=speaker2, size=word_dim) - -# This hierarchical RNN is designed to be equivalent to the RNN in -# sequence_nest_rnn_multi_unequalength_inputs.conf - - -def step(x1, x2): - def calrnn(y): - mem = memory(name='rnn_state_' + y.name, size=hidden_dim) - out = fc_layer( - input=[y, mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name='rnn_state_' + y.name) - return out - - encoder1 = calrnn(x1) - encoder2 = calrnn(x2) - return [encoder1, encoder2] - - -encoder1_rep, encoder2_rep = recurrent_group( - name="stepout", step=step, input=[emb1, emb2]) - -encoder1_last = last_seq(input=encoder1_rep) -encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep) -context = mixed_layer( - input=[ - identity_projection(encoder1_expandlast), - identity_projection(encoder2_rep) - ], - size=hidden_dim) - -rep = last_seq(input=context) -prob = fc_layer( - size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) - -outputs( - classification_cost( - input=prob, label=data_layer( - name="label", size=label_dim))) diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp deleted file mode 100644 index b5e4af26dc123be3748adb4faed5fe1656ca44b3..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_ActivationGrad.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include <gtest/gtest.h> -#include <string> -#include <vector> -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_bool(thread_local_rand_use_global_seed); - -void testActivation(const string& act) { - LOG(INFO) << "test activation: " << act; - size_t size = 10; - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("addto"); - config.layerConfig.set_size(size); - config.layerConfig.set_active_type(act); - config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0}); - config.layerConfig.add_inputs(); - for (auto useGpu : {false, true}) { - testLayerGrad(config, - act + "_activation", - 100, - /* trans= */ false, - useGpu, - /* useWeight */ true); - } -} - -TEST(Activation, activation) { - auto types = ActivationFunction::getAllRegisteredTypes(); - std::set<string> excluded{"sequence_softmax"}; - for (auto type : types) { - if (excluded.count(type)) continue; - testActivation(type); - } -} - -void testSequenceSoftmaxAct(bool hasSubseq) { - LOG(INFO) << "test activation: sequence softmax"; - - const size_t size = 1; - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("addto"); - config.layerConfig.set_size(size); - config.layerConfig.set_active_type("sequence_softmax"); - config.inputDefs.push_back( - {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, - "layer_0", - 1, - 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, - "sequence_softmax", - 100, - /* trans= */ false, - useGpu, - /* useWeight */ true); - } -} - -TEST(SequenceSoftmaxActivation, activation) { - for (auto hasSubseq : {false, true}) { - LOG(INFO) << "hasSubseq = " << hasSubseq; - testSequenceSoftmaxAct(hasSubseq); - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp deleted file mode 100644 index a3ec66c75829c5ef0ae834656ee82e40be76c892..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_BatchNorm.cpp +++ /dev/null @@ -1,195 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/utils/GlobalConstants.h" - -#include "LayerGradUtil.h" -#include "paddle/cuda/include/hl_batch_norm.h" -#include "paddle/math/tests/TensorCheck.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(gpu_id); -DECLARE_double(checkgrad_eps); -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_bool(prev_batch_state); - -// Test that the batchNormLayer can be followed by a ConvLayer -TEST(Layer, batchNorm) { - FLAGS_use_gpu = false; - TestConfig configBN; - const int CHANNELS = 6272; - const int IMG_SIZE = 1; - configBN.layerConfig.set_type("batch_norm"); - configBN.layerConfig.set_name("bn"); - configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE); - configBN.layerConfig.set_active_type("relu"); - configBN.biasSize = CHANNELS; - configBN.inputDefs.push_back({INPUT_DATA, - "layer_0", - /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS, - /* paraSize= */ CHANNELS}); - - configBN.inputDefs.push_back( - {INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); - configBN.inputDefs.back().isStatic = true; - configBN.inputDefs.push_back( - {INPUT_DATA, "layer_2_running_var", 1, CHANNELS}); - configBN.inputDefs.back().isStatic = true; - - LayerInputConfig* input = configBN.layerConfig.add_inputs(); - configBN.layerConfig.add_inputs(); - configBN.layerConfig.add_inputs(); - - ImageConfig* img_conf = input->mutable_image_conf(); - img_conf->set_channels(CHANNELS); - img_conf->set_img_size(IMG_SIZE); - - // Setting up conv-layer config - TestConfig config; - config.biasSize = 64; - config.layerConfig.set_type("exconv"); - config.layerConfig.set_num_filters(64); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800}); - input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(5); - conv->set_filter_size_y(5); - conv->set_channels(128); - conv->set_padding(1); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); - conv->set_img_size(7); - conv->set_output_x(3); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * - config.layerConfig.num_filters()); - config.layerConfig.set_name("conv"); - - // data layer initialize - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer(configBN, - &dataLayers, - &datas, - &layerMap, - "batch_norm", - 100, - false, - false); - // test layer initialize - std::vector parameters; - LayerPtr bnLayer; - initTestLayer(configBN, &layerMap, ¶meters, &bnLayer); - - std::vector parameters2; - LayerPtr convLayer; - initTestLayer(config, &layerMap, ¶meters2, &convLayer); - - bnLayer->forward(PASS_GC); - convLayer->forward(PASS_GC); - - CHECK_EQ(static_cast(convLayer->getOutputValue()->getHeight()), 100); - CHECK_EQ(static_cast(convLayer->getOutputValue()->getWidth()), 576); -} - -#ifdef PADDLE_WITH_CUDA -void batchNormInference(int n, int c, int h, int w) { - MatrixPtr input = std::make_shared(n, c * h * w); - MatrixPtr cudnnOut = std::make_shared(n, c * h * w); - MatrixPtr cudaOut = std::make_shared(n, c * h * w); - MatrixPtr cudnnCheck = std::make_shared(n, c * h * w); - MatrixPtr cudaCheck = std::make_shared(n, c * h * w); - input->randomizeUniform(); - 
cudnnOut->zeroMem(); - cudaOut->zeroMem(); - - MatrixPtr scale = std::make_shared(1, c); - scale->randomizeUniform(); - MatrixPtr bias = std::make_shared(1, c); - bias->randomizeUniform(); - - MatrixPtr movingMean = std::make_shared(1, c); - movingMean->randomizeUniform(); - - MatrixPtr movingVar = std::make_shared(1, c); - movingVar->randomizeUniform(); - movingVar->clip(0.01, 50); - - hl_tensor_descriptor ioDesc; - hl_tensor_descriptor bnDesc; - hl_create_tensor_descriptor(&ioDesc); - hl_create_tensor_descriptor(&bnDesc); - hl_tensor_reshape(ioDesc, n, c, h, w); - hl_tensor_reshape(bnDesc, 1, c, 1, 1); - - double EPS = 1E-5; - hl_batch_norm_forward_inference(ioDesc, - input->getData(), - ioDesc, - cudnnOut->getData(), - bnDesc, - scale->getData(), - bias->getData(), - movingMean->getData(), - movingVar->getData(), - EPS); - - hl_batch_norm_cuda_inference(input->getData(), - cudaOut->getData(), - scale->getData(), - bias->getData(), - movingMean->getData(), - movingVar->getData(), - EPS, - n, - c, - h, - w); - - cudnnCheck->copyFrom(*cudnnOut); - cudaCheck->copyFrom(*cudaOut); - autotest::TensorCheckErr(*cudnnCheck, *cudaCheck); - - hl_destroy_tensor_descriptor(ioDesc); - hl_destroy_tensor_descriptor(bnDesc); -} - -TEST(BatchNorm, Inference) { - batchNormInference(33, 267, 1, 1); - batchNormInference(19, 105, 4, 4); -} -#endif - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp deleted file mode 100644 index 9f3d2936569af8f1923a471f4d262e9a472649c0..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_CRFLayerGrad.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include <gtest/gtest.h> -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/LinearChainCRF.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT - -DECLARE_int32(gpu_id); -DECLARE_bool(thread_local_rand_use_global_seed); - -static inline bool getNextSequence(std::vector<int>& seq, int numClasses) { - for (auto& v : seq) { - if (++v < numClasses) { - return true; - } - v = 0; - } - return false; -} - -// log(exp(x) + exp(y)) -static inline real logSum(real x, real y) { - real maxValue = std::max(x, y); - if (std::isinf(maxValue)) { - return -std::numeric_limits<real>::infinity(); - } else { - return maxValue + log(exp(x - maxValue) + exp(y - maxValue)); - } -} - -static inline std::vector<int> genRandLabels(int numClasses, int length) { - std::vector<int> labels(length); - for (int i = 0; i < length; ++i) { - labels[i] = rand() % numClasses; // NOLINT - } - return labels; -} - -TEST(CRFLayer, cost) { - const int numClasses = 4; - CpuVector para(numClasses * (numClasses + 2)); - real* a = para.getData(); - real* b = para.getData() + numClasses; - real* w = para.getData() + 2 * numClasses; - LinearChainCRF crf(4, para.getData()); - for (int length : {1, 2, 3, 10}) { - for (int tries = 0; tries < 10; ++tries) { - CpuMatrix x(length, numClasses); - x.randomizeUniform(); - para.randnorm(0, 2); - - std::vector<int> goldenLabels = genRandLabels(numClasses, length); - - real cost = crf.forward(x.getData(), goldenLabels.data(), length); - - real logZ = -std::numeric_limits<real>::infinity(); - real logNominator = -std::numeric_limits<real>::infinity(); - std::vector<int> testResult(length, 0); - do { - real score = a[testResult.front()]; - score += x.getElement(0, testResult.front()); - for (int k = 1; k < length; ++k) { - score += x.getElement(k, testResult[k]) + - w[numClasses * testResult[k - 1] + testResult[k]]; - } - score += b[testResult.back()]; - logZ = logSum(logZ, score); - - if (goldenLabels == testResult) { - logNominator = score; - } - } while (getNextSequence(testResult, numClasses)); - - real trueCost = -logNominator + logZ; - - real diff = fabs(trueCost - cost); - diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost); - VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff - << std::endl; - if (typeid(real) == typeid(double)) { // NOLINT - EXPECT_LE(diff, 1e-10); - } else { - EXPECT_LE(diff, 5e-3); - } - } - } -} - -inline real epsilon() { return typeid(real) == typeid(double) ?
1e-10 : 0.06; } - -TestConfig initTestConfig(size_t numClasses, bool withWeight) { - TestConfig config; - config.layerConfig.set_type("crf"); - config.layerConfig.set_size(numClasses); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, - "layer_0", - numClasses, - numClasses * (numClasses + 2)}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back( - {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0}); - config.layerConfig.add_inputs(); - - if (withWeight) { - config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0}); - config.layerConfig.add_inputs(); - } - - return config; -} - -TEST(Layer, CRFLayer) { - size_t numClasses = 10; - for (int tries = 0; tries < 5; ++tries) { - TestConfig config = initTestConfig(numClasses, /* withWeight= */ false); - for (int length : {1, 3, 100}) { - // Not support GPU now - testLayerGrad(config, - "crf", - length, - /* trans= */ false, - /* useGpu= */ false, - /* useWeight= */ false, - epsilon()); - } - } -} - -TEST(Layer, CRFLayerUseWeight) { - size_t numClasses = 10; - for (int tries = 0; tries < 5; ++tries) { - TestConfig config = initTestConfig(numClasses, /* withWeight= */ true); - for (int length : {1, 3, 100}) { - // Not support GPU now - testLayerGrad(config, - "crf", - length, - /* trans= */ false, - /* useGpu= */ false, - /* useWeight= */ false, - epsilon()); - } - } -} - -int main(int argc, char** argv) { - initMain(argc, argv); - hl_start(); - hl_init(FLAGS_gpu_id); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp deleted file mode 100644 index 2fbc404125a9364ac44a990f8ec92962cf7d1298..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_CompareSparse.cpp +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/trainer/Trainer.h" - -#include -#include - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -static const string& configFile1 = "gserver/tests/sequence_lstm.conf"; - -DECLARE_bool(use_gpu); -DECLARE_string(config); -DECLARE_int32(gpu_id); -DECLARE_int32(seed); -DECLARE_int32(num_passes); -DECLARE_int32(saving_period); - -DECLARE_int32(num_gradient_servers); -DECLARE_int32(port); -DECLARE_bool(local); -DECLARE_bool(use_old_updater); -DECLARE_bool(parallel_nn); -DECLARE_string(config_args); -DEFINE_double(max_diff_ratio, - 0.0f, - "max diff ratio allowed for parameters value"); - -int gNumDevices = 0; - -std::vector trainerOnePassTest(const string& configFile, - bool sparseUpdate, - int trainerCount = 1, - bool useGpu = false) { - FLAGS_use_gpu = useGpu; - FLAGS_config = configFile; - FLAGS_trainer_count = trainerCount; - FLAGS_config_args = sparseUpdate ? 
"sparse_update=1" : "sparse_update=0"; - - LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount - << " configFile=" << configFile << " sparseUpdate=" << sparseUpdate; - srand(FLAGS_seed); - *ThreadLocalRand::getSeed() = FLAGS_seed; - ThreadLocalRandomEngine::get().seed(FLAGS_seed); - if (useGpu) { - CHECK_LE(trainerCount, gNumDevices); - } - - std::vector> pservers; - if (!FLAGS_local) { - int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse; - pservers.resize(numPorts); - - for (int i = 0; i < numPorts; ++i) { - pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i)); - pservers[i]->init(); - pservers[i]->start(); - } - } - - Trainer trainer; - trainer.init(TrainerConfigHelper::createFromFlagConfig()); - trainer.train(); - return trainer.getGradientMachine()->getParameters(); -} - -std::vector& getDenseParameters() { - static std::vector denseParameters; - if (denseParameters.empty()) { - // use dense training as base - FLAGS_local = true; - denseParameters = trainerOnePassTest(configFile1, false); - } - - return denseParameters; -} - -void checkBuffer(real* A, - const char* desA, - real* B, - const char* desB, - size_t len, - double maxDiffRatio) { - double maxDiff = 0; - double maxValue = 0; - for (size_t i = 0; i < len; ++i) { - double diff = fabs(A[i] - B[i]); - maxValue = std::max(maxValue, std::max(fabs(A[i]), fabs(B[i]))); - maxDiff = std::max(maxDiff, diff); - } - EXPECT_LE(maxDiff / maxValue, maxDiffRatio); - LOG(INFO) << " maxDiff=" << maxDiff << " maxValue=" << maxValue - << " maxDiff/maxValue=" << maxDiff / maxValue << "\n\n"; -} - -void compareValue(const vector& parametersA, - const vector& parametersB, - double maxDiffRatio = 0.0) { - LOG(INFO) << "\n\n--------------------------------" - << " Check Gradient Machine Parameters:" - << " -------------------------------------\n"; - for (size_t i = 0; i < parametersA.size(); ++i) { - ParameterPtr parameterA, parameterB; - parameterA = parametersA[i]; - parameterB = parametersB[i]; - - CpuVector paraA(parameterA->getSize()); - CpuVector paraB(parameterB->getSize()); - paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE)); - paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE)); - - LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() - << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), - "para_A", - paraB.getData(), - "para_B", - paraA.getSize(), - maxDiffRatio); - } -} - -TEST(compareSparse, cpu) { - FLAGS_local = 1; // disable remote sparse update in parameter config - std::vector parameters = trainerOnePassTest(configFile1, true); - compareValue(getDenseParameters(), parameters); -} - -TEST(compareSparse, remote_cpu) { - FLAGS_local = 0; // will enable remote sparse update - FLAGS_ports_num_for_sparse = 5; - std::vector parameters = trainerOnePassTest(configFile1, true); - compareValue(getDenseParameters(), parameters); -} - -TEST(compareSparse, cpu10_local_vs_remote) { - FLAGS_local = 1; // disable remote sparse update in parameter config - std::vector localParameters = - trainerOnePassTest(configFile1, true, 2); - - FLAGS_local = 0; // will enable remote sparse update - FLAGS_ports_num_for_sparse = 5; - std::vector remoteParameters = - trainerOnePassTest(configFile1, true, 2); - - compareValue(localParameters, remoteParameters); -} - -TEST(compareSparse, multiGradientMachine) { - int numGpu; -#ifdef PADDLE_TYPE_DOUBLE - double eps = 1e-8; -#else - double eps = 1e-4; -#endif - numGpu = hl_get_device_count(); - for (bool local : 
{false, true}) { - FLAGS_local = local; - FLAGS_ports_num_for_sparse = 5; - for (bool useGpu : {false, true}) { -#ifndef PADDLE_WITH_CUDA - if (useGpu) continue; -#endif - FLAGS_parallel_nn = useGpu; - LOG(INFO) << " local=" << local << " useGpu=" << useGpu; - int trainerCount = useGpu ? numGpu : 2; - std::vector parameters = - trainerOnePassTest(configFile1, true, trainerCount, useGpu); - compareValue(getDenseParameters(), parameters, eps); - } - } - FLAGS_parallel_nn = false; -} - -TEST(compareSparse, NeuralNetwork) { -#ifdef PADDLE_TYPE_DOUBLE - double eps = 1e-8; -#else - double eps = 1e-4; -#endif - for (bool local : {false, true}) { - FLAGS_local = local; - FLAGS_ports_num_for_sparse = 5; - for (bool useGpu : {false, true}) { -#ifndef PADDLE_WITH_CUDA - if (useGpu) continue; -#endif - FLAGS_parallel_nn = useGpu; - LOG(INFO) << " local=" << local << " useGpu=" << useGpu; - int trainerCount = 1; - std::vector parameters = - trainerOnePassTest(configFile1, true, trainerCount, useGpu); - compareValue(getDenseParameters(), parameters, useGpu ? eps : 0); - } - } - FLAGS_parallel_nn = false; -} - -int main(int argc, char** argv) { - // FIXME(tonyyang-svail): - // Turn off this test due CI failure: - // https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430 - return 0; - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - initPython(argc, argv); - - gNumDevices = hl_get_device_count(); - FLAGS_num_passes = 1; // train one pass - FLAGS_saving_period = 100000; // do not save parameter - - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_CompareTwoNets.cpp b/paddle/gserver/tests/test_CompareTwoNets.cpp deleted file mode 100644 index 1c9b4002a34ca5a9b668be69bd0ad392eb763803..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_CompareTwoNets.cpp +++ /dev/null @@ -1,209 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include - -#include "paddle/trainer/Trainer.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_int32(gpu_id); - -DECLARE_bool(local); -DECLARE_bool(use_gpu); - -DECLARE_string(config); -DECLARE_string(nics); - -DEFINE_bool(need_high_accuracy, - false, - "whether need to run in double accuracy"); -DEFINE_double( - max_diff_ratio, - 0.0f, - "max diff ratio allowed for outputs and parameters (value/gradient)"); -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_int32(seed); - -static const string& config_file_a = "gserver/tests/sequence_recurrent.py"; -static const string& config_file_b = - "gserver/tests/sequence_recurrent_group.py"; - -struct ComData { - vector outArgs; - vector parameters; -}; - -void calcGradient(ComData& data, const string configFile) { - FLAGS_config = configFile; - - FLAGS_local = true; - FLAGS_use_gpu = false; - - FLAGS_nics = ""; - - *ThreadLocalRand::getSeed() = FLAGS_seed; - srand(FLAGS_seed); - - Trainer trainer; - trainer.init(TrainerConfigHelper::createFromFlagConfig(), false); - - data.parameters = trainer.getGradientMachine()->getParameters(); - - DataBatch dataBatch; - int32_t batchSize = trainer.getConfig().opt_config().batch_size(); - - trainer.getDataProvider()->reset(); - trainer.getDataProvider()->setSkipShuffle(); - trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); - - CHECK(dataBatch.getSize()) << "No data from data provider"; - vector& inArgs = dataBatch.getStreams(); - - trainer.getGradientMachine()->start(); - trainer.getGradientMachine()->forwardBackward( - inArgs, &data.outArgs, PASS_TRAIN); - - trainer.getGradientMachine()->finish(); -} - -void checkBuffer(real* A, - const char* desA, - real* B, - const char* desB, - size_t len, - size_t width = 1) { - int nNum = 0; - real maxVal = 0; - for (size_t i = 0; i < len; ++i) { - maxVal = std::max(maxVal, std::max(A[i], B[i])); - } - real maxDiff = 0; - for (size_t i = 0; i < len; ++i) { - real diff = fabs(A[i] - B[i]); - maxDiff = std::max(maxDiff, diff); - if (diff > maxVal * FLAGS_max_diff_ratio) { - nNum++; - VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] << " " - << desB << " : " << B[i] << " diff=" << diff; - } - } - EXPECT_EQ(0, nNum); - LOG(INFO) << "maxValue=" << maxVal << " maxDiff=" << maxDiff << "\n\n"; -} - -void compareGradient(ComData& comDataA, ComData& comDataB) { - vector outArgsA = comDataA.outArgs; - vector outArgsB = comDataB.outArgs; - - for (size_t i = 0; i < outArgsA.size(); ++i) { - CpuMatrix matA(outArgsA[i].value->getHeight(), - outArgsA[i].value->getWidth()); - CpuMatrix matB(outArgsB[i].value->getHeight(), - outArgsB[i].value->getWidth()); - - matA.copyFrom(*outArgsA[i].value); - matB.copyFrom(*outArgsB[i].value); - - LOG(INFO) << "\n--------------------------------" - << " Check Network Output_" << i << ":" - << " -------------------------------------\n"; - checkBuffer(matA.getData(), - "network A output", - matB.getData(), - "network B output", - matA.getElementCnt(), - matA.getWidth()); - } - - vector& parametersA = comDataA.parameters; - vector& parametersB = comDataB.parameters; - - LOG(INFO) << "\n\n--------------------------------" - << " Check Gradient Machine Parameters:" - << " -------------------------------------\n"; - for (size_t i = 0; i < parametersA.size(); ++i) { - ParameterPtr parameterA, parameterB; - parameterA = parametersA[i]; - parameterB = parametersB[i]; - - CpuVector paraA(parameterA->getSize()); - CpuVector paraB(parameterB->getSize()); - 
paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE)); - paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE)); - - LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() - << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), - "Network A", - paraB.getData(), - "Network B", - paraA.getSize()); - - CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT)); - CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT)); - - LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName() - << " ; size : " << gradA.getSize() << " -----------"; - checkBuffer(gradA.getData(), - "Network A", - gradB.getData(), - "Network B", - gradA.getSize()); - } -} - -TEST(Trainer, create) { - ComData dataA; - calcGradient(dataA, config_file_a); - LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n"; - - ComData dataB; - calcGradient(dataB, config_file_b); - LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n"; - - compareGradient(dataA, dataB); -} - -int main(int argc, char** argv) { - FLAGS_thread_local_rand_use_global_seed = true; - paddle::initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - initPython(argc, argv); - -#ifndef PADDLE_TYPE_DOUBLE - if (FLAGS_need_high_accuracy) { - LOG(INFO) << "skip test due to it's need high accuracy"; - return 0; - } - if (FLAGS_max_diff_ratio == 0.0f) { - FLAGS_max_diff_ratio = 1e-5; - LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio - << " in low accuracy mode"; - } -#else - if (FLAGS_max_diff_ratio == 0.0f) { - FLAGS_max_diff_ratio = 1e-10; - LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio - << " in high accuracy mode"; - } -#endif - - int ret = RUN_ALL_TESTS(); - return ret; -} diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp deleted file mode 100644 index 2e394a74b7d53fc53727d817c06479d545ade65d..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_ConvTrans.cpp +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/GlobalConstants.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(gpu_id); -DECLARE_double(checkgrad_eps); -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_bool(prev_batch_state); - -// Test that the convTrans forward is the same as conv backward -TEST(Layer, convTransLayerFwd) { - // Setting up conv-trans layer - TestConfig configt; - configt.biasSize = 3; - configt.layerConfig.set_type("exconvt"); - configt.layerConfig.set_num_filters(3); - configt.layerConfig.set_partial_sum(1); - configt.layerConfig.set_shared_biases(true); - - configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); - LayerInputConfig* input = configt.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(4); - conv->set_channels(16); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_filter_channels(3 / conv->groups()); - conv->set_img_size(16); - conv->set_output_x(outputSize(conv->img_size(), - conv->filter_size(), - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - configt.layerConfig.set_size(conv->img_size() * conv->img_size() * - configt.layerConfig.num_filters()); - configt.layerConfig.set_name("convTrans"); - - // data layer initialize - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer( - configt, &dataLayers, &datas, &layerMap, "convTrans", 100, false, false); - // test layer initialize - std::vector parameters; - LayerPtr convtLayer; - initTestLayer(configt, &layerMap, ¶meters, &convtLayer); - convtLayer->getBiasParameter()->zeroMem(); - convtLayer->forward(PASS_GC); - - // Setting up conv-layer config - TestConfig config; - config.biasSize = 16; - config.layerConfig.set_type("exconv"); - config.layerConfig.set_num_filters(16); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384}); - input = config.layerConfig.add_inputs(); - conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(4); - conv->set_channels(3); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); - conv->set_img_size(16); - conv->set_output_x(outputSize(conv->img_size(), - conv->filter_size(), - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * - config.layerConfig.num_filters()); - config.layerConfig.set_name("conv"); - - // data layer initialize - std::vector dataLayers2; - LayerMap layerMap2; - vector datas2; - initDataLayer( - config, &dataLayers2, &datas2, &layerMap2, "conv", 100, false, false); - // test layer initialize - std::vector parameters2; - LayerPtr convLayer; - initTestLayer(config, &layerMap2, ¶meters2, &convLayer); - - // Sync convLayer and convtLayer parameter - convLayer->getBiasParameter()->zeroMem(); - convLayer->getParameters()[0] - ->getBuf(PARAMETER_VALUE) - ->copyFrom(*(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE))); - - // Set convLayer outputGrad as 
convTransLayer input value - convLayer->forward(PASS_GC); - convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue())); - - vector callbackFlags(parameters2.size(), 0); - auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; }; - convLayer->backward(callback); - - // Check that the convLayer backward is the same as convTransLayer forward - checkMatrixEqual(convtLayer->getOutputValue(), - dataLayers2[0]->getOutputGrad()); -} - -// Do one forward pass of convTrans layer and check to see if its output -// matches the given result -void doOneConvtTest(size_t imgSize, - size_t output_x, - size_t stride, - size_t padding, - size_t filter_size, - MatrixPtr& result) { - TestConfig configt; - configt.biasSize = 1; - configt.layerConfig.set_type("exconvt"); - configt.layerConfig.set_num_filters(1); - configt.layerConfig.set_partial_sum(1); - configt.layerConfig.set_shared_biases(true); - - configt.inputDefs.push_back( - {INPUT_DATA, "layer_0", output_x * output_x, filter_size * filter_size}); - LayerInputConfig* input = configt.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(filter_size); - conv->set_filter_size_y(filter_size); - conv->set_channels(1); - conv->set_padding(padding); - conv->set_padding_y(padding); - conv->set_stride(stride); - conv->set_stride_y(stride); - conv->set_groups(1); - conv->set_filter_channels(1); - conv->set_img_size(imgSize); - conv->set_output_x(output_x); - - configt.layerConfig.set_size(conv->img_size() * conv->img_size() * - configt.layerConfig.num_filters()); - configt.layerConfig.set_name("convTrans"); - - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer( - configt, &dataLayers, &datas, &layerMap, "convTrans", 1, false, false); - dataLayers[0]->getOutputValue()->zeroMem(); - dataLayers[0]->getOutputValue()->add(1.0); - - // test layer initialize - std::vector parameters; - LayerPtr convtLayer; - initTestLayer(configt, &layerMap, ¶meters, &convtLayer); - convtLayer->getBiasParameter()->zeroMem(); - convtLayer->getParameters()[0]->zeroMem(); - convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0); - convtLayer->forward(PASS_GC); - - checkMatrixEqual(convtLayer->getOutputValue(), result); -} - -TEST(Layer, convTransLayerFwd2) { - MatrixPtr result; - result = Matrix::create(1, 5 * 5, false, false); - result->zeroMem(); - result->add(1.0); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 1, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 5, - result); - - real resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, - 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; - result->setData(resultData); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 4, - result); - - real resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, - 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; - result->setData(resultData2); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 2, - /* stride */ 2, - /* padding */ 1, - /* filter_size */ 5, - result); - - real resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4, - 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1}; - result->setData(resultData3); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 2, - /* stride */ 2, - /* padding */ 0, - /* filter_size */ 3, - result); -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git 
a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp deleted file mode 100644 index ba820d9a2acabf95ff816705e4df124bb95da077..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_ConvUnify.cpp +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/GlobalConstants.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(gpu_id); -DECLARE_double(checkgrad_eps); -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_bool(prev_batch_state); - -// Do one forward pass of ConvLayer using either exconv or cudnn_conv -MatrixPtr doOneConvTest(size_t imgSize, - size_t output_x, - size_t stride, - size_t padding, - size_t filter_size, - size_t channel, - size_t numfilters, - size_t groups, - MatrixPtr& inputData, - real* param, - bool useGpu, - bool isDeconv = false) { - TestConfig config; - config.biasSize = numfilters; - string layerType; - if (useGpu) { - layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv"; - } else { - layerType = (isDeconv) ? 
"exconvt" : "exconv"; - } - config.layerConfig.set_type(layerType); - config.layerConfig.set_num_filters(numfilters); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - size_t weightSize = channel * filter_size * filter_size * - config.layerConfig.num_filters() / groups; - if (isDeconv) { - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize}); - config.layerConfig.set_size(imgSize * imgSize * - config.layerConfig.num_filters()); - } else { - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize}); - config.layerConfig.set_size(output_x * output_x * - config.layerConfig.num_filters()); - } - - LayerInputConfig* input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(filter_size); - conv->set_filter_size_y(filter_size); - conv->set_channels(channel); - conv->set_padding(padding); - conv->set_padding_y(padding); - conv->set_stride(stride); - conv->set_stride_y(stride); - conv->set_groups(groups); - conv->set_img_size(imgSize); - conv->set_output_x(output_x); - - if (isDeconv) { - conv->set_filter_channels(numfilters / groups); - } else { - conv->set_filter_channels(channel / groups); - } - - config.layerConfig.set_name("conv"); - - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer( - config, &dataLayers, &datas, &layerMap, "conv", 1, false, useGpu); - dataLayers[0]->getOutputValue()->zeroMem(); - dataLayers[0]->getOutputValue()->copyFrom(*inputData); - - // test layer initialize - std::vector parameters; - LayerPtr convLayer; - initTestLayer(config, &layerMap, ¶meters, &convLayer); - convLayer->getBiasParameter()->zeroMem(); - convLayer->getParameters()[0]->zeroMem(); - convLayer->getParameters()[0] - ->getBuf(PARAMETER_VALUE) - ->copyFrom(param, weightSize); - convLayer->forward(PASS_GC); - - return convLayer->getOutputValue(); -} - -TEST(Layer, convParaUnified) { -#ifdef PADDLE_WITH_CUDA - MatrixPtr input, resultCpu, resultGpu; - - /// TEST1 for conv /// - input = Matrix::create(1, 4 * 4, false, false); - real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; - real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1}; - - input->setData(inputData); - - resultCpu = doOneConvTest(/* imgSize */ 4, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 3, - /*channel*/ 1, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param, - /*useGpu*/ false); - - resultGpu = doOneConvTest(/* imgSize */ 4, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 3, - /*channel*/ 1, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param, - /*useGpu*/ true); - checkMatrixEqual(resultCpu, resultGpu); - - /// TEST1 for deconv /// - input = Matrix::create(1, 2 * 2, false, false); - real inputDataT[] = {1, 2, 3, 4}; - input->setData(inputDataT); - - resultCpu = doOneConvTest(/* imgSize */ 4, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 3, - /*channel*/ 1, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param, - /*useGpu*/ false, - /*isDeconv*/ true); - - resultGpu = doOneConvTest(/* imgSize */ 4, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 3, - /*channel*/ 1, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param, - /*useGpu*/ true, - /*isDeconv*/ true); - checkMatrixEqual(resultCpu, resultGpu); - - /// TEST2 for conv /// - input = Matrix::create(1, 3 * 3 * 2, false, false); 
- real inputData2[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - real param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1}; - - input->setData(inputData2); - - resultCpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param2, - /*useGpu*/ false); - - resultGpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param2, - /*useGpu*/ true); - checkMatrixEqual(resultCpu, resultGpu); - - /// TEST3 for conv /// - real param3[] = {1, 2, 3, 4, 4, 3, 2, 1}; - - resultCpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 2, - input, - param3, - /*useGpu*/ false); - - resultGpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 2, - input, - param3, - /*useGpu*/ true); - checkMatrixEqual(resultCpu, resultGpu); - - /// TEST2 for deconv /// - input = Matrix::create(1, 2 * 2 * 2, false, false); - real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8}; - input->setData(inputData2T); - - resultCpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param2, - /*useGpu*/ false, - /*isDeconv*/ true); - - resultGpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param2, - /*useGpu*/ true, - /*isDeconv*/ true); - checkMatrixEqual(resultCpu, resultGpu); - - /// TEST3 for deconv /// - resultCpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 2, - input, - param3, - /*useGpu*/ false, - /*isDeconv*/ true); - - resultGpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 2, - input, - param3, - /*useGpu*/ true, - /*isDeconv*/ true); - checkMatrixEqual(resultCpu, resultGpu); -#endif -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp deleted file mode 100644 index 0041ed30939d1a6111a2db753da6172bb65e374b..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp +++ /dev/null @@ -1,352 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT - -DECLARE_int32(gpu_id); -DECLARE_bool(thread_local_rand_use_global_seed); - -const size_t MAX_SEQ_NUM = 23; -const size_t MAX_SEQ_LEN = 50; -const size_t MAX_BEAM_SIZE = 27; - -const size_t SEED = (size_t)(time(NULL)); - -struct SingleBeamExpansion { - vector seqStartPos; - vector subSeqStartPos; - vector candidateScores; - - // TODO(caoying): store this into Argument.ids - vector selectedIndices; - - vector groundTruth; - vector inBeam; - vector rowIdxInBeam; - vector colIdxInBeam; - - void resetGroundTruth(size_t n) { - groundTruth.clear(); - groundTruth.resize(n, -1); - - inBeam.clear(); - inBeam.resize(n, 0); - - rowIdxInBeam.clear(); - rowIdxInBeam.resize(n, -1); - - colIdxInBeam.clear(); - colIdxInBeam.resize(n, -1); - } -}; - -inline float randFloat() { - return static_cast(rand()) / static_cast(RAND_MAX); -} - -void genRand(real* numbers, size_t n) { - default_random_engine generator; - uniform_real_distribution distribution(0.0, 1.0); - for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator); -} - -vector randSampling(real range, int n) { - CHECK_GE(range, n); - vector num(range); - iota(begin(num), end(num), 0.); - if (range == n) return num; - - random_shuffle(begin(num), end(num)); - num.resize(n); - sort(begin(num), end(num)); - return num; -} - -void genCandidateScores(bool hasSubseq, - size_t beamSize, - SingleBeamExpansion& prevBeam, - SingleBeamExpansion& curBeam) { - vector& seqStartPos = curBeam.seqStartPos; - seqStartPos.resize(1, 0); - vector& subSeqStartPos = curBeam.subSeqStartPos; - subSeqStartPos.resize(1, 0); - - srand(SEED); - if (prevBeam.selectedIndices.size()) { - if (prevBeam.subSeqStartPos.size() > 1) { - int seqIdx = 1; - // samples in previous beam are nested sequences. - for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) { - for (size_t j = 0; j < beamSize; ++j) { - if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break; - subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) + - subSeqStartPos.back()); - } - if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) { - seqStartPos.push_back(subSeqStartPos.back()); - seqIdx++; - } - } - } else { - for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) { - if (i && i % beamSize == 0) { - seqStartPos.push_back(subSeqStartPos.back()); - if (i == prevBeam.selectedIndices.size()) break; - } - if (prevBeam.selectedIndices[i] == -1.) continue; - subSeqStartPos.push_back(subSeqStartPos.back() + - (1 + (rand() % MAX_SEQ_LEN))); - } - } - } else { - // the first beam expansion - int seqNum = 1 + (rand() % MAX_SEQ_NUM); - for (int i = 0; i < seqNum; ++i) { - if (hasSubseq) { - for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j) - subSeqStartPos.push_back(subSeqStartPos.back() + - (1 + (rand() % MAX_SEQ_LEN))); - seqStartPos.push_back(subSeqStartPos.back()); - } else { - seqStartPos.push_back(seqStartPos.back() + - (1 + (rand() % MAX_SEQ_LEN))); - } - } - } - - size_t totalSeqNum = hasSubseq ? 
subSeqStartPos.back() : seqStartPos.back(); - curBeam.candidateScores.resize(totalSeqNum, 0.); - genRand(curBeam.candidateScores.data(), totalSeqNum); -} - -void genSelectedIndices(size_t beamSize, - vector& seqStartPos, - vector& selectedIndices) { - size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1); - selectedIndices.resize(selectedIdsCount, -1.); - - for (size_t i = 0; i < seqStartPos.size() - 1; ++i) { - int seqLen = seqStartPos[i + 1] - seqStartPos[i]; - int n = min(seqLen, static_cast(beamSize)); - vector ids = randSampling(seqLen, n); - memcpy(selectedIndices.data() + i * beamSize, - ids.data(), - sizeof(real) * ids.size()); - } -} - -void genGroundTruth(vector& beamExpansions, - size_t beamSize) { - SingleBeamExpansion& beam = beamExpansions[1]; - size_t seqNum = beam.seqStartPos.size() - 1; - for (size_t i = 2; i < beamExpansions.size(); ++i) - CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1); - - srand(SEED); - - // initialize the first beam. - beam.resetGroundTruth(seqNum); - for (size_t i = 0; i < seqNum; ++i) { - if (randFloat() > 0.5) { - /* - * force the randomly generated label falls in the beam by chance 0.5. - * otherwise, when sequence length is relatively long and beam size is - * relatively small, the gold sequences falls off the beam at in the - * first search. - */ - real* begPos = beam.selectedIndices.data() + i * beamSize; - beam.colIdxInBeam[i] = - rand() % count_if(begPos, begPos + beamSize, [](const real& val) { - return val != -1.; - }); - beam.groundTruth[i] = - beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]]; - beam.inBeam[i] = 1; - } else { - int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]); - beam.groundTruth[i] = label; - - real* begPos = beam.selectedIndices.data() + i * beamSize; - real* endPos = begPos + beamSize; - real* lblPos = find(begPos, endPos, real(label)); - if (lblPos != endPos) { - beam.inBeam[i] = 1; - beam.colIdxInBeam[i] = lblPos - begPos; - } - } - beam.rowIdxInBeam[i] = i; - } - - // iterate over each beam expansions - for (size_t i = 2; i < beamExpansions.size(); ++i) { - SingleBeamExpansion& curBeam = beamExpansions[i]; - SingleBeamExpansion& prevBeam = beamExpansions[i - 1]; - curBeam.resetGroundTruth(seqNum); - - // iterate over each sequence - for (size_t j = 0; j < seqNum; ++j) { - if (!prevBeam.inBeam[j]) continue; - - // gold sequence falls in the beam in previous search. - real* begPos = prevBeam.selectedIndices.data(); - int offset = - prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j]; - curBeam.rowIdxInBeam[j] = count_if( - begPos, begPos + offset, [](const real& val) { return val != -1.; }); - - if (randFloat() > 0.5) { - // force the randomly generated label falls in the beam by chance 0.5. 
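// ---------------------------------------------------------------------------
// Illustrative aside (not part of the deleted file). Each row of
// selectedIndices holds up to beamSize candidate ids, right-padded with -1.
// Counting the valid entries first guarantees that the random column drawn
// below always lands on a real candidate, e.g.:
//
//   const real row[5] = {3., 7., 1., -1., -1.};
//   int valid = std::count_if(row, row + 5,
//                             [](const real& v) { return v != -1.; });
//   int col = rand() % valid;  // uniform over the 3 occupied slots
// ---------------------------------------------------------------------------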
- - real* start = - curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize; - int n = rand() % count_if(start, start + beamSize, [](const real& val) { - return val != -1.; - }); - curBeam.colIdxInBeam[j] = n; - curBeam.groundTruth[j] = *(start + n); - curBeam.inBeam[j] = 1; - } else { - CHECK_LE((size_t)curBeam.rowIdxInBeam[j] + 1, - curBeam.subSeqStartPos.size() - 1); - int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]]; - int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1]; - CHECK_GT(size_t(end), size_t(start)); - int label = rand() % (end - start); - - curBeam.groundTruth[j] = label; - real* findBeg = - curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize; - real* lblPos = - find(findBeg, findBeg + beamSize, static_cast(label)); - if (lblPos != (findBeg + beamSize)) { - curBeam.inBeam[j] = 1; - curBeam.colIdxInBeam[j] = lblPos - findBeg; - } - } - } - } -} - -void genOneBeam(size_t beamSize, - bool hasSubseq, - SingleBeamExpansion& prevBeam, - SingleBeamExpansion& curBeam) { - genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam); - genSelectedIndices(beamSize, - hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos, - curBeam.selectedIndices); -} - -void genRandomBeamExpansion(size_t expansionCount, - size_t beamSize, - vector& beamExpansions) { - beamExpansions.clear(); - beamExpansions.resize(expansionCount + 1); - - // beamExpansions[0] is reserved. - for (size_t i = 1; i <= expansionCount; ++i) - genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]); - genGroundTruth(beamExpansions, beamSize); -} - -void testCrossEntropyOverBeam(bool useGpu, - size_t beamSize, - vector& beams) { - TestConfig config; - config.layerConfig.set_type("cross_entropy_over_beam"); - - size_t seqNum = 0; - for (size_t i = 1; i < beams.size(); ++i) { - const SingleBeamExpansion& beam = beams[i]; - // create scores for all the candidates - MatrixPtr candidateScorePtr = - Matrix::create(beam.candidateScores.size(), 1, false, false); - candidateScorePtr->copyFrom(beam.candidateScores.data(), - beam.candidateScores.size()); - - ostringstream paramName; - paramName << "candidate_scores_" << i; - - if (beam.subSeqStartPos.size() > 1) { - seqNum = beam.subSeqStartPos.size() - 1; - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, - paramName.str(), - candidateScorePtr, - beam.seqStartPos, - beam.subSeqStartPos}); - } else { - seqNum = beam.seqStartPos.size() - 1; - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, - paramName.str(), - candidateScorePtr, - beam.seqStartPos}); - } - config.layerConfig.add_inputs(); - - // create indices for the selected candidates - MatrixPtr selectedCandidates = - Matrix::create(seqNum, beamSize, false, false); - selectedCandidates->copyFrom(beam.selectedIndices.data(), - beam.selectedIndices.size()); - paramName.clear(); - paramName << "selected_candidates_" << i; - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates}); - config.layerConfig.add_inputs(); - - // create the ground truth - paramName.clear(); - paramName << "label_" << i; - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth}); - config.layerConfig.add_inputs(); - } - - testLayerGrad( - config, "cross_entropy_over_beam", seqNum, false, useGpu, false); -} - -TEST(Layer, CrossEntropyOverBeam) { - LOG(INFO) << "SEED = " << SEED; - const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE; - LOG(INFO) << "beamSize = " << beamSize; - - // TODO(caoying): test with random beam 
expansions. - const size_t expansionCount = 3; - vector beams; - genRandomBeamExpansion(expansionCount, beamSize, beams); - - for (bool useGpu : {false, true}) - testCrossEntropyOverBeam(useGpu, beamSize, beams); -} - -int main(int argc, char** argv) { - initMain(argc, argv); - hl_start(); - hl_init(FLAGS_gpu_id); - FLAGS_thread_local_rand_use_global_seed = true; - srand(SEED); - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp deleted file mode 100644 index 168ffbdac8cd6fb0ee4fa62e3766905c30d1844b..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_KmaxSeqScore.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/utils/GlobalConstants.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(gpu_id); -DECLARE_bool(thread_local_rand_use_global_seed); - -vector randSampling(int range, int n) { - CHECK_GE(range, n); - vector num(range); - iota(begin(num), end(num), 0); - if (range == n) return num; - - random_shuffle(begin(num), end(num)); - num.resize(n); - return num; -} - -void genRandomSeqInfo(vector& seqStartPosition, - vector& subSeqStartPosition) { - const int maxSeqNum = 100; - // generate random start position information - int seqNum = 1 + (rand() % maxSeqNum); - seqStartPosition.resize(seqNum + 1, 0); - subSeqStartPosition.resize(1, 0); - - for (int i = 0; i < seqNum; ++i) { - int subSeqLen = 1 + (rand() % maxSeqNum); - for (int j = 0; j < subSeqLen; ++j) - subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen); - seqStartPosition[i + 1] = subSeqStartPosition.back(); - } -} - -void genRandomGroundTruth(real* values, - vector>& groundTruth, - vector& startPos, - size_t beamSize) { - groundTruth.resize(startPos.size() - 1, vector(beamSize, -1)); - for (size_t i = 0; i < startPos.size() - 1; ++i) { - int seqLen = startPos[i + 1] - startPos[i]; - vector pos = - randSampling(seqLen, min(static_cast(beamSize), seqLen)); - for (size_t j = 0; j < pos.size(); ++j) { - groundTruth[i][j] = pos[j]; - values[startPos[i] + pos[j]] = 1.; - } - } -} - -void checkLayerOut(vector> groundTruth, - real* layerOut, - size_t beamSize) { - for (size_t i = 0; i < groundTruth.size(); ++i) { - int begPos = i * beamSize; - vector tmp(layerOut + begPos, layerOut + begPos + beamSize); - sort(begin(tmp), end(tmp)); - sort(begin(groundTruth[i]), end(groundTruth[i])); - for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]); - } -} - -TEST(Layer, kmaxSeqScoreLayer) { - const size_t maxBeamSize = 100; - size_t beamSize = 1 + (rand() % maxBeamSize); - - vector seqStartPosition; - vector subSeqStartPosition; - 
genRandomSeqInfo(seqStartPosition, subSeqStartPosition); - MatrixPtr inValue = - Matrix::create(subSeqStartPosition.back(), 1, false, false); - - std::vector mode = {false}; -#ifdef PADDLE_WITH_CUDA - mode.push_back(true); -#endif - - for (auto hasSubseq : {false, true}) { - vector> groundTruth; - inValue->randomizeUniform(); - genRandomGroundTruth(inValue->getData(), - groundTruth, - hasSubseq ? subSeqStartPosition : seqStartPosition, - beamSize); - - for (auto useGpu : mode) { - TestConfig config; - config.layerConfig.set_type("kmax_seq_score"); - config.layerConfig.set_beam_size(beamSize); - - if (hasSubseq) { - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, - "scores", - inValue, - seqStartPosition, - subSeqStartPosition}); - } else { - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition}); - } - config.layerConfig.add_inputs(); - - // data layer initialize - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer( - config, - &dataLayers, - &datas, - &layerMap, - "kmax_seq_score", - 100 /* actually this parameter is unused in self-defined input*/, - false, - useGpu); - // test layer initialize - std::vector parameters; - LayerPtr kmaxSeqScoreLayer; - FLAGS_use_gpu = useGpu; - initTestLayer(config, &layerMap, ¶meters, &kmaxSeqScoreLayer); - kmaxSeqScoreLayer->forward(PASS_TRAIN); - - const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue(); - CHECK_EQ(outValue->getHeight(), - hasSubseq ? subSeqStartPosition.size() - 1 - : seqStartPosition.size() - 1); - CHECK_EQ(outValue->getWidth(), beamSize); - checkLayerOut(groundTruth, outValue->getData(), beamSize); - } - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand((size_t)(time(NULL))); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp deleted file mode 100644 index 1254d580505512dc8fd7e34a053a7538832d271f..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ /dev/null @@ -1,2532 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_CUDA -#include -#endif -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/math/MathUtils.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(gpu_id); -DECLARE_double(checkgrad_eps); -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_bool(prev_batch_state); - -TEST(Operator, dot_mul) { - TestConfig config; - config.layerConfig.set_size(10); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); - operatorConf.set_type("dot_mul"); - operatorConf.set_dotmul_scale(-1); - - testOperatorGrad(config, operatorConf, 100, false, false); -} - -TEST(Projection, context) { - for (auto contextStart : {-5, -3, -1, 0, 3}) { - for (auto contextLength : {1, 2, 5, 7}) { - for (auto batchSize : {1, 2, 5, 20}) { - for (auto trainablePadding : {false, true}) { - LOG(INFO) << " contextStart=" << contextStart - << " contextLength=" << contextLength - << " batchSize=" << batchSize - << " trainablePadding=" << trainablePadding; - ProjectionConfig conf; - conf.set_type("context"); - conf.set_input_size(10); - conf.set_context_start(contextStart); - conf.set_context_length(contextLength); - conf.set_trainable_padding(trainablePadding); - conf.set_output_size(conf.context_length() * conf.input_size()); - int pad = - std::max(0, -conf.context_start()) + - std::max(0, conf.context_start() + conf.context_length() - 1); - for (auto useGpu : {false, true}) { - testProjectionGrad( - conf, - INPUT_SEQUENCE_DATA, - trainablePadding ? 
conf.input_size() * pad : 0, - batchSize, - useGpu, - contextStart + contextLength <= 1); // = testState - } - } - } - } - } -} - -TEST(Projection, trans_fc) { - ProjectionConfig conf; - conf.set_type("trans_fc"); - conf.set_input_size(50); - conf.set_output_size(20); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 1000, - /* batchSize */ 100, - useGpu); - } -} - -TEST(Projection, fc) { - ProjectionConfig conf; - conf.set_type("fc"); - conf.set_input_size(10); - conf.set_output_size(20); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 200, - /* batchSize */ 100, - useGpu); - } -} - -TEST(Projection, dot_mul) { - ProjectionConfig conf; - conf.set_type("dot_mul"); - conf.set_input_size(20); - conf.set_output_size(20); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 20, - /* batchSize */ 100, - useGpu); - } -} - -TEST(Projection, table) { - ProjectionConfig conf; - conf.set_type("table"); - conf.set_input_size(10); - conf.set_output_size(20); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_LABEL, - /* parameterSize */ 200, - /* batchSize */ 100, - useGpu); - } -} - -TEST(Projection, identity) { - ProjectionConfig conf; - conf.set_type("identity"); - conf.set_input_size(10); - conf.set_output_size(10); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 0, - /* batchSize */ 100, - useGpu); - } -} - -TEST(Projection, slice) { - ProjectionConfig conf; - conf.set_type("slice"); - conf.set_input_size(100); - SliceConfig& slice1 = *conf.add_slices(); - slice1.set_start(10); - slice1.set_end(20); - SliceConfig& slice2 = *conf.add_slices(); - slice2.set_start(50); - slice2.set_end(70); - conf.set_output_size(30); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 0, - /* batchSize */ 10, - useGpu); - } -} - -TEST(Projection, scaling) { - ProjectionConfig conf; - conf.set_type("scaling"); - conf.set_input_size(10); - conf.set_output_size(10); - for (auto useGpu : {false}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 1, - /* batchSize */ 100, - useGpu); - } -} - -void testProjectionConv(size_t groups, bool isDeconv) { - const int NUM_FILTERS = 18; - const int FILTER_SIZE = 2; - const int FILTER_SIZE_Y = 2; - const int CHANNELS = 3; - const int IMAGE_SIZE = 16; - -#if CUDNN_VERSION >= 6000 - const int DILATION = 2; -#else - const int DILATION = 1; -#endif - - ProjectionConfig conf; - if (isDeconv) { - conf.set_type("convt"); - } else { - conf.set_type("conv"); - } - conf.set_num_filters(NUM_FILTERS); - - ConvConfig* conv = conf.mutable_conv_conf(); - conv->set_filter_size(FILTER_SIZE); - conv->set_filter_size_y(FILTER_SIZE_Y); - conv->set_channels(CHANNELS); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_dilation(DILATION); - conv->set_dilation_y(DILATION); - conv->set_groups(groups); - if (isDeconv) { - conv->set_filter_channels(NUM_FILTERS / conv->groups()); - } else { - conv->set_filter_channels(conv->channels() / conv->groups()); - } - conv->set_img_size(IMAGE_SIZE); - int output_x = outputSize(conv->img_size(), - (conv->filter_size() - 1) * DILATION + 1, - conv->padding(), - conv->stride(), - /* caffeMode */ true); - int output_y = outputSize(conv->img_size(), - (conv->filter_size_y() - 1) * DILATION + 1, - conv->padding_y(), - 
conv->stride_y(), - /* caffeMode */ true); - conv->set_output_x(output_x); - conv->set_output_y(output_y); - LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x - << "; output_y: " << output_y; - if (isDeconv) { - int deconv_image_x = imageSize(output_x, - (conv->filter_size() - 1) * DILATION + 1, - conv->padding(), - conv->stride(), - /* caffeMode */ true); - int deconv_image_y = imageSize(output_y, - (conv->filter_size_y() - 1) * DILATION + 1, - conv->padding_y(), - conv->stride_y(), - /* caffeMode */ true); - - LOG(INFO) << " deconv_image_x: " << deconv_image_x - << "; deconv_image_y: " << deconv_image_y; - conf.set_input_size(output_x * output_y * CHANNELS); - conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS); - } else { - conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); - conf.set_output_size(output_x * output_y * NUM_FILTERS); - } - - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * - FILTER_SIZE_Y / groups, - /* batchSize */ 100, - true, - false, - NUM_FILTERS, - true); -} - -#ifdef PADDLE_WITH_CUDA -TEST(Projection, conv) { - /// test ConvProjection - testProjectionConv(1, false); - testProjectionConv(3, false); - /// test ConvTransProjection - testProjectionConv(1, true); - testProjectionConv(3, true); -} -#endif - -TEST(Layer, BilinearInterpLayer) { - TestConfig config; - config.layerConfig.set_type("bilinear_interp"); - config.biasSize = 0; - config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); - - LayerInputConfig* input = config.layerConfig.add_inputs(); - BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf(); - ImageConfig* image = bilinear->mutable_image_conf(); - image->set_img_size(32); - image->set_img_size_y(32); - image->set_channels(4); - - for (auto useGpu : {false, true}) { - for (auto outSize : {32, 64}) { - bilinear->set_out_size_x(outSize); - bilinear->set_out_size_y(outSize); - testLayerGrad(config, "bilinear_interp", 10, false, useGpu); - } - } -} - -TEST(Layer, concat) { - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("concat"); - config.layerConfig.set_size(15); - config.layerConfig.set_active_type("sigmoid"); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "concat", 100, false, useGpu); - } -} - -TEST(Layer, AddtoLayer) { - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("addto"); - config.layerConfig.set_size(10); - config.layerConfig.set_active_type("sigmoid"); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "addto", 100, false, useGpu); - } -} - -TEST(Layer, CTCLayer) { - TestConfig config; - config.layerConfig.set_type("ctc"); - config.layerConfig.set_norm_by_times(false); - config.layerConfig.set_size(10); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0}); - config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, - "ctc", - 100, - /* trans */ false, /* useGpu */ - useGpu); - } -} - 
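// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the deleted file). Every case in this
// file funnels into testLayerGrad(), a numerical gradient checker. The
// underlying idea, reduced to one dimension (numericGrad is a hypothetical
// helper, not a Paddle API):
#include <functional>

static double numericGrad(const std::function<double(double)>& f,
                          double x,
                          double eps = 1e-4) {
  // Central difference: O(eps^2) truncation error, versus O(eps) for the
  // one-sided quotient.
  return (f(x + eps) - f(x - eps)) / (2.0 * eps);
}
// A layer passes when |numericGrad - analyticGrad| stays within a tolerance
// on the order of the checkgrad_eps flag declared at the top of this file.
// ---------------------------------------------------------------------------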
-TEST(Layer, cosSimLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("cos");
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cos", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, CosSimVecMatLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("cos_vm");
-  config.layerConfig.set_size(5);  // output size
-  config.layerConfig.set_cos_scale(2.0);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cos_vm", 100, false, useGpu);
-  }
-}
-
-void testDepthwiseConvLayer(const string& type, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 32;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(32);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(16);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_img_size_y(8);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
-  // Use a small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
-}
-
-TEST(Layer, depthwiseConvLayer) {
-  // 'depthwise_conv' is a special case of 'exconv' whose
-  // group count equals the number of input channels.
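// ---------------------------------------------------------------------------
// Illustrative aside (not part of the deleted file). With
// groups == channels == 16 above, each input channel is convolved with its
// own filters only, so the weight count is
//   num_filters * (channels / groups) * filter_w * filter_h
//     = 32 * (16 / 16) * 2 * 3 = 192,
// matching the 192 in the inputDefs entry, where an ungrouped convolution of
// the same shape would need 32 * 16 * 2 * 3 = 3072 weights. The 2048 input
// size is likewise channels * img_size * img_size_y = 16 * 16 * 8.
// ---------------------------------------------------------------------------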
- testDepthwiseConvLayer("exconv", /* useGpu= */ false); -#ifdef PADDLE_WITH_CUDA - testDepthwiseConvLayer("exconv", /* useGpu= */ true); -#endif -} - -void testConvLayer(const string& type, bool trans, bool useGpu) { - TestConfig config; - config.biasSize = 16; - config.layerConfig.set_type(type); - config.layerConfig.set_num_filters(16); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - int dilation = 2; - if (type == "cudnn_conv") { -#if CUDNN_VERSION >= 6000 - dilation = 2; -#else - dilation = 1; -#endif - } - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(2); - conv->set_channels(3); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_dilation(dilation); - conv->set_dilation_y(dilation); - conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); - conv->set_img_size(16); - conv->set_img_size_y(16); - conv->set_output_x(outputSize(conv->img_size(), - (conv->filter_size() - 1) * dilation + 1, - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - conv->set_output_y(outputSize(conv->img_size_y(), - (conv->filter_size_y() - 1) * dilation + 1, - conv->padding_y(), - conv->stride_y(), - /* caffeMode */ true)); - config.layerConfig.set_size(conv->output_x() * conv->output_y() * - config.layerConfig.num_filters()); - - testLayerGrad(config, "conv", 100, trans, useGpu); - // Use small batch_size and useWeight=true to test biasGrad - testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02); -} - -TEST(Layer, convLayer) { - testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false); -#ifdef PADDLE_WITH_CUDA - testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true); - testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true); -#endif -} - -void testConvTransLayer(const string& type, bool trans, bool useGpu) { - TestConfig config; - config.biasSize = 3; - config.layerConfig.set_type(type); - config.layerConfig.set_num_filters(3); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(4); - conv->set_channels(16); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_filter_channels(3 / conv->groups()); - conv->set_img_size(16); - conv->set_output_x(outputSize(conv->img_size(), - conv->filter_size(), - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - - config.layerConfig.set_size(conv->img_size() * conv->img_size() * - config.layerConfig.num_filters()); - - testLayerGrad(config, "convTrans", 100, trans, useGpu); - // Use small batch_size and useWeight=true to test biasGrad - testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02); -} - -TEST(Layer, convTransLayer) { - for (auto useGpu : {false, true}) { - testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu); - } -#ifdef PADDLE_WITH_CUDA - testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true); -#endif -} - -TEST(Layer, blockExpandLayer) { - TestConfig config; - config.biasSize = 0; - 
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  BlockExpandConfig* blockExpand = input->mutable_block_expand_conf();
-  blockExpand->set_img_size_x(64);
-  blockExpand->set_img_size_y(32);
-  blockExpand->set_channels(3);
-  blockExpand->set_padding_x(0);
-  blockExpand->set_padding_y(0);
-  blockExpand->set_block_x(4);
-  blockExpand->set_block_y(32);
-  blockExpand->set_stride_x(2);
-  blockExpand->set_stride_y(2);
-  blockExpand->set_output_x(outputSize(blockExpand->img_size_x(),
-                                       blockExpand->block_x(),
-                                       blockExpand->padding_x(),
-                                       blockExpand->stride_x(),
-                                       /* caffeMode */ false));
-  blockExpand->set_output_y(outputSize(blockExpand->img_size_y(),
-                                       blockExpand->block_y(),
-                                       blockExpand->padding_y(),
-                                       blockExpand->stride_y(),
-                                       /* caffeMode */ false));
-  config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() *
-                              blockExpand->channels());
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "blockexpand", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, maxoutLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("maxout");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  MaxOutConfig* maxout = input->mutable_maxout_conf();
-  ImageConfig* image = maxout->mutable_image_conf();
-
-  image->set_img_size(32);
-  image->set_img_size_y(32);
-  image->set_channels(4);
-  maxout->set_groups(2);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "maxout", 10, false, useGpu);
-  }
-}
-
-void testFcLayer(string format, size_t nnz) {
-  TestConfig config;
-  config.biasSize = 1024;
-  config.layerConfig.set_type("fc");
-  config.layerConfig.set_size(1024);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_drop_rate(0.1);
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)});
-  config.layerConfig.add_inputs();
-
-  LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
-            << config.inputDefs[0].sparse.format;
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "fc",
-                  100,
-                  /* trans */ false,
-                  useGpu,
-                  /* weight */ true);
-  }
-}
-
-TEST(Layer, fcLayer) {
-  testFcLayer("", 1024 * 1024 * 2);
-  testFcLayer("csc", 1024 * 10);
-  testFcLayer("csr", 1024 * 10);
-}
-
-TEST(Layer, SelectiveFullyConnectedLayer) {
-  TestConfig config;
-  size_t nin = 16;
-  size_t nout = 256;
-  config.layerConfig.set_type("selective_fc");
-  config.layerConfig.set_size(nout);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_has_selected_colums(true);
-  config.layerConfig.set_selective_fc_pass_generation(false);
-  config.biasSize = nout;
-
-  config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)});
-  config.layerConfig.add_inputs();
-
-  testLayerGrad(config,
-                "selective_fc",
-                100,
-                /* trans= */ false,
-                /* useGpu= */ false,
-                false);
-#ifdef PADDLE_WITH_CUDA
-  testLayerGrad(config,
-                "selective_fc",
-                100,
-                /* trans= */ false,
-                /* useGpu= */ true,
-                false);
-#endif
-}
-
-TEST(Layer, DataNormLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("data_norm");
-  config.layerConfig.set_size(20);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100});
-  config.inputDefs.back().isStatic = true;
-  config.layerConfig.add_inputs();
-
-  for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) {
-    config.layerConfig.set_data_norm_strategy(strategy);
-    // The parameters are static, so GPU is not supported for now
-    testLayerGrad(config,
-                  "data_norm",
-                  200,
-                  /* trans */ false,
-                  /* useGpu */ false);
-  }
-}
-
-TEST(Layer, hsigmoidLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("hsigmoid");
-  config.layerConfig.set_num_classes(5);
-  config.layerConfig.set_size(1);
-  config.biasSize = config.layerConfig.num_classes() - 1;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "hsigmoid",
-                  100,
-                  /* trans */ false,
-                  /* useGpu */ useGpu);
-  }
-}
-
-TEST(Layer, multi_cross) {
-  TestConfig config;
-  config.layerConfig.set_type("multi-class-cross-entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(
-        config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, multi_binary_label_sparse_mat) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_binary_label_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "multi_binary_label_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, multi_binary_label_id) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_binary_label_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "multi_binary_label_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, multi_cross_with_selfnorm) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm");
-  config.layerConfig.set_softmax_selfnorm_alpha(0.1);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // GPU is not supported for now
-  testLayerGrad(config,
-                "multi_class_cross_entropy_with_selfnorm",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, multi_cross_soft) {
-  TestConfig config;
-  config.layerConfig.set_type("soft_binary_class_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "soft_binary_class_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, sparse_square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // "GpuSparseMatrix" as label is not supported
-  testLayerGrad(config,
-                "square_error",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, sparse_float_square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // "GpuSparseMatrix" as label is not supported
-  testLayerGrad(config,
-                "square_error",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, square_error_weighted) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-  config.testAccumulate = false;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, huber_regression_loss) {
-  TestConfig config;
-  config.layerConfig.set_type("huber_regression");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto delta : {1, 3, 5}) {
-      config.layerConfig.set_delta(delta);
-      testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, huber_two_class) {
-  TestConfig config;
-  config.layerConfig.set_type("huber_classification");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu);
-  }
-}
-
-void testExpandLayer(string trans_type, bool hasSubseq) {
-  TestConfig config;
-  config.layerConfig.set_type("expand");
-
-  config.inputDefs.push_back(
-      {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       10,
-       0});
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_1",
-       10,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-  LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq;
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "expand", 30, false, useGpu);
-  }
-}
-
-TEST(Layer, ExpandLayer) {
-  testExpandLayer("non-seq", false);  // non-seq expand to seq
-  testExpandLayer("non-seq", true);   // non-seq expand to hasSubseq
-  testExpandLayer("seq", true);       // seq expand to hasSubseq
-}
-
-void testDegradeLayer(bool hasSubseq,
-                      string layer_type,
-                      string trans_type,
-                      int stride) {
-  TestConfig config;
-  config.layerConfig.set_type(layer_type);
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_seq_pool_stride(stride);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       10,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-
-  auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) {
-    for (auto useGpu : {false, true}) {
-      testLayerGrad(config, layer_type, 100, false, useGpu);
-    }
-  };
-
-  if (layer_type == "average") {
-    for (auto strategy : {"average", "sum", "squarerootn"}) {
-      LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-                << " average_strategy=" << strategy
-                << " seq_pool_stride=" << stride;
-      config.layerConfig.set_average_strategy(strategy);
-      testDegradeLayerGrad(config, layer_type);
-    }
-  } else {
-    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-              << " seq_pool_stride=" << stride;
-    testDegradeLayerGrad(config, layer_type);
-  }
-}
-
-TEST(Layer, MaxLayer) {
-  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
-  testDegradeLayer(false,
-                   "max",
-                   "non-seq",
-                   5);  // seq max to a shortened seq, stride window = 5
-  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
-}
-
-TEST(Layer, SequenceLastInstanceLayer) {
-  testDegradeLayer(false,
-                   "seqlastins",
-                   "non-seq",
-                   -1);  // seq seqlastins to non-seq
-  testDegradeLayer(false,
-                   "seqlastins",
-                   "non-seq",
-                   5);  // seq seqlastins to a shortened seq, stride window = 5
-  testDegradeLayer(true,
-                   "seqlastins",
-                   "non-seq",
-                   -1);  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(true,
-                   "seqlastins",
-                   "seq",
-                   -1);  // hasSubseq seqlastins to seq
-}
-
-TEST(Layer, AverageLayer) {
-  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
-  testDegradeLayer(false,
-                   "average",
-                   "non-seq",
-                   5);  // seq average to a shortened seq, stride window = 5
-  testDegradeLayer(true,
-                   "average",
-                   "non-seq",
-                   -1);  // hasSubseq average to non-seq
-  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
-}
-
-TEST(Layer, SequenceConcatLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("seqconcat");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "seqconcat", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, SequenceReshapeLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("seqreshape");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "seqreshape", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ConvShiftLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("conv_shift");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // GPU is not supported for now
-  testLayerGrad(config, "conv_shift", 100, false, false);
-}
-
-TEST(Layer, PowerLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("power");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "power", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ConvexCombinationLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("convex_comb");
-  config.layerConfig.set_size(20);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "convex_comb", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, InterpolationLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("interpolation");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "interpolation", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, DotProdLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("dot_prod");
-  config.layerConfig.set_size(1);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "dot_prod", 10, false, useGpu);
-  }
-}
-
-TEST(Layer, OuterProdLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("out_prod");
-  config.layerConfig.set_size(100);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "out_prod", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, SlopeInterceptLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("slope_intercept");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_slope(1.0);
-  config.layerConfig.set_intercept(0.1);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "slope_intercept", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ScalingLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("scaling");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "scaling", 100, false, useGpu);
-  }
-}
-
-void testNormLayer(const string& normType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.layerConfig.set_type("norm");
-  config.layerConfig.set_active_type("relu");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type(normType);
-  norm->set_channels(16);
-  norm->set_size(5);
-  norm->set_scale(0.001);
-  norm->set_pow(0.75);
-  norm->set_blocked(0);
-  norm->set_img_size(14);
-  norm->set_img_size_y(7);
-  norm->set_output_x(norm->img_size());
-  norm->set_output_y(norm->img_size_y());
-  if (norm->norm_type() == "cmrnorm" ||
-      norm->norm_type() == "cmrnorm-projection") {
-    norm->set_scale(norm->scale() / norm->size());
-  } else {
-    norm->set_scale(norm->scale() / (norm->size() * norm->size()));
-  }
-
-  config.layerConfig.set_size(norm->output_x() * norm->output_y() *
-                              norm->channels());
-  config.biasSize = 0;
-
-  testLayerGrad(config, "norm", 100, trans, useGpu);
-}
-
-TEST(Layer, NormLayer) {
-  testNormLayer("cmrnorm-projection",
-                /* trans= */ false, /* useGpu= */
-                true);
-  testNormLayer("cmrnorm-projection",
-                /* trans= */ false, /* useGpu= */
-                false);
-}
-
-void setPoolConfig(TestConfig* config,
-                   PoolConfig* pool,
-                   const string& poolType) {
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool");
-  (*config).layerConfig.set_num_filters(16);
-
-  int kw = 3, kh = 3;
-  int pw = 0, ph = 0;
-  int sw = 2, sh = 2;
-  pool->set_pool_type(poolType);
-  pool->set_channels(16);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_start(0);
-  pool->set_padding(pw);
-  pool->set_padding_y(ph);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-}
-
-void testPoolLayer(const string& poolType,
-                   bool trans,
-                   bool useGpu,
-                   bool excludeMode = true) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_img_size(14);
-  pool->set_img_size_y(14);
-  pool->set_exclude_mode(excludeMode);
-  setPoolConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool", 100, trans, useGpu);
-}
-
-#ifdef PADDLE_WITH_CUDA
-void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_size_y(4);
-  pool->set_stride_y(3);
-  pool->set_img_size(10);
-  pool->set_img_size_y(20);
-  setPoolConfig(&config, pool, poolType);
-  pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) /
-                         ((float)pool->stride_y()) +
-                     1.5);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool", 100, trans, useGpu);
-}
-#endif
-
-TEST(Layer, PoolLayer) {
-  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
-  testPoolLayer("avg-projection",
-                /* trans= */ false,
-                /* useGpu= */ false,
-                /* excludeMode= */ false);
-  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
-  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
-
-#ifdef PADDLE_WITH_CUDA
-  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("avg-projection",
-                /* trans= */ false,
-                /* useGpu= */ true,
-                /* excludeMode= */ false);
-  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-avg-incl-pad-pool",
-                 /* trans= */ false,
-                 /* useGpu= */ true);
-  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void setPool3DConfig(TestConfig* config,
-                     PoolConfig* pool,
-                     const string& poolType) {
-  // filter size
-  const int NUM_FILTERS = 16;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-  const int CHANNELS = 16;
-
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool3d");
-  (*config).layerConfig.set_num_filters(NUM_FILTERS);
-
-  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
-  int pw = 0, ph = 0, pd = 0;
-  int sw = 2, sh = 2, sd = 2;
-
-  pool->set_pool_type(poolType);
-  pool->set_channels(CHANNELS);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_size_z(kd);
-  pool->set_padding(0);
-  pool->set_padding_y(0);
-  pool->set_padding_z(0);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-  pool->set_stride_z(sd);
-  pool->set_start(0);
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-  pool->set_output_z(od);
-}
-
-void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  const int IMAGE_SIZE = 9;
-  const int IMAGE_SIZE_Y = 9;
-  const int IMAGE_SIZE_Z = 9;
-
-  pool->set_img_size(IMAGE_SIZE);
-  pool->set_img_size_y(IMAGE_SIZE_Y);
-  pool->set_img_size_z(IMAGE_SIZE_Z);
-
-  setPool3DConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool3d", 100, trans, useGpu);
-}
-
-TEST(Layer, Pool3DLayer) {
-  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
-  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
-  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void testSppLayer(const string& poolType,
-                  const int pyramidHeight,
-                  bool trans,
-                  bool useGpu) {
-  TestConfig config;
-  config.layerConfig.set_type("spp");
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  SppConfig* sppConfig = input->mutable_spp_conf();
-  sppConfig->set_pool_type(poolType);
-  sppConfig->set_pyramid_height(pyramidHeight);
-  ImageConfig* imageConfig = sppConfig->mutable_image_conf();
-  imageConfig->set_channels(16);
-  imageConfig->set_img_size(10);
-  imageConfig->set_img_size_y(20);
-  int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
-  config.layerConfig.set_size(outputSize * imageConfig->channels());
-  testLayerGrad(config, "spp", 100, trans, useGpu);
-}
-
-TEST(Layer, SpatialPyramidPoolLayer) {
-  for (auto useGpu : {false, true}) {
-    for (auto pyramidHeight : {1, 2, 3}) {
-      testSppLayer("avg-projection", pyramidHeight, false, useGpu);
-      testSppLayer("max-projection", pyramidHeight, false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, rankCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("rank-cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "rank-cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, sumCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("sum_cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "sum_cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, weightedRankCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("rank-cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, TensorLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("tensor");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = config.layerConfig.size();
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "tensor", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, RecurrentLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("recurrent");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("tanh");
-  config.biasSize = 4;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(
-          config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
-    }
-  }
-}
-
-TEST(Layer, LstmLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("lstmemory");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("tanh");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 28;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(
-          config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
-    }
-  }
-  for (auto useGpu : {true}) {
-    config.testBatchState = true;
-    config.layerConfig.set_reversed(false);
-    testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, MDLstmLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("mdlstmemory");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 4 * 9;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_directions(true);
-  config.layerConfig.add_directions(true);
-
-  for (auto useGpu : {false, true}) {
-    for (int i = 0; i < 2; i++) {
-      for (int j = 0; j < 2; j++) {
-        config.layerConfig.set_directions(0, bool(i));
-        config.layerConfig.set_directions(1, bool(j));
-        testLayerGrad(config, "mdlstmemory", 100, false, useGpu);
-      }
-    }
-  }
-}
-
-TEST(Layer, ParameterReluLayer) {
-  auto testParameterReluLayer = [&](size_t inputSize, size_t channels) {
-    TestConfig config;
-    config.layerConfig.set_type("prelu");
-    config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_size(inputSize);
-    config.layerConfig.set_partial_sum(inputSize /
-                                       channels);  // size of feature map
-    for (auto useGpu : {false, true}) {
-      testLayerGrad(config, "prelu", 100, false, useGpu);
-    }
-  };
-
-  testParameterReluLayer(192, 1);
-  testParameterReluLayer(192, 3);
-  testParameterReluLayer(192, 192);
-}
-
-TEST(Layer, ResizeLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("resize");
-  config.layerConfig.set_size(64);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "resize", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, RotateLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("rotate");
-  const int CHANNEL = 2;
-  const int HEIGHT = 8;
-  const int WIDTH = 4;
-  const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL;
-  config.layerConfig.set_size(INPUT_SIZE);
-  config.layerConfig.set_height(HEIGHT);
-  config.layerConfig.set_width(WIDTH);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "rotate", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, NCELayer) {
-  TestConfig config;
-  size_t numClasses = 4;
-  config.layerConfig.set_type("nce");
-  config.layerConfig.set_size(1);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_num_classes(numClasses);
-  config.biasSize = numClasses;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 * numClasses});
-  config.inputDefs.push_back(
-      {INPUT_LABEL, "label", /* dim= */ numClasses, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto withWeight : {false, true}) {
-    if (withWeight) {
-      config.inputDefs.push_back(
-          {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0});
-      config.layerConfig.add_inputs();
-    }
-
-    for (auto isIdLabel : {false, true}) {
-      config.inputDefs[1] = {
-          isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA,
-          "label",
-          /* dim= */ numClasses,
-          /* paraSize= */ 0};
-
-      for (auto withDist : {false, true}) {
-        config.layerConfig.clear_neg_sampling_dist();
-        if (withDist) {
-          double sum = 0;
-          for (size_t i = 0; i < numClasses; ++i) {
-            real p = rand();  // NOLINT use rand_r
-            config.layerConfig.add_neg_sampling_dist(p);
-            sum += p;
-          }
-          for (size_t i = 0; i < numClasses; ++i) {
-            real p = config.layerConfig.neg_sampling_dist(i) / sum;
-            config.layerConfig.set_neg_sampling_dist(i, p);
-          }
-        }
-        LOG(INFO) << "NCELayer "
-                  << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight
-                  << " withDist=" << withDist;
-        // GPU is not supported for now
-        testLayerGrad(config,
-                      "nce",
-                      100,
-                      /* trans= */ false,
-                      /* useGpu */ false);
-      }
-    }
-  }
-}
-
-TEST(Layer, GatedRecurrentLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("gated_recurrent");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, GruStepLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("gru_step");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, LstmStepLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("lstm_step");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-  config.testAccumulate = false;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu);
-  }
-}
-
-void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  const int CHANNELS = 10;
-  const int IMG_SIZE = 16;
-  const int IMG_SIZE_Y = 8;
-  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = CHANNELS;
-  config.inputDefs.push_back({INPUT_DATA,
-                              "layer_0",
-                              /* dim= */ size,
-                              /* paraSize= */ CHANNELS});
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-  img_conf->set_img_size_y(IMG_SIZE_Y);
-
-  testLayerGrad(config,
-                "batch_norm",
-                64,
-                /* trans= */ trans,
-                useGpu,
-                /* useWeight */ true);
-}
-
-TEST(Layer, BatchNormalizationLayer) {
-  testBatchNormLayer("batch_norm", false, false);
-#ifdef PADDLE_WITH_CUDA
-  testBatchNormLayer("batch_norm", false, true);
-  if (hl_get_cudnn_lib_version() >= int(4000)) {
-    testBatchNormLayer("cudnn_batch_norm", false, true);
-  }
-#endif
-}
-
-void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  const int CHANNELS = 10;
-  const int IMG_SIZE = 16;
-  const int IMG_SIZE_Y = 8;
-  const int IMG_SIZE_Z = 8;
-  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y * IMG_SIZE_Z;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = CHANNELS;
-  config.inputDefs.push_back({INPUT_DATA,
-                              "layer_0",
-                              /* dim= */ size,
-                              /* paraSize= */ CHANNELS});
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-  img_conf->set_img_size_y(IMG_SIZE_Y);
-  img_conf->set_img_size_z(IMG_SIZE_Z);
-
-  testLayerGrad(config,
-                "batch_norm",
-                64,
-                /* trans= */ trans,
-                useGpu,
-                /* useWeight */ true);
-}
-
-TEST(Layer, testBatchNorm3DLayer) {
-  testBatchNorm3DLayer("batch_norm", false, false);
-#ifdef PADDLE_WITH_CUDA
-  testBatchNorm3DLayer("batch_norm", false, true);
-  if (hl_get_cudnn_lib_version() >= int(4000)) {
-    testBatchNorm3DLayer("cudnn_batch_norm", false, true);
-  }
-#endif
-}
-
-void testConvOperator(bool isDeconv) {
-  TestConfig config;
-  const int NUM_FILTERS = 16;
-  const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 3;
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 16;
-  const int IMAGE_SIZE_Y = 9;
-  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  if (isDeconv) {
-    operatorConf.set_type("convt");
-  } else {
-    operatorConf.set_type("conv");
-  }
-  ConvConfig* conv = operatorConf.mutable_conv_conf();
-  operatorConf.set_num_filters(NUM_FILTERS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_channels(CHANNELS);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_img_size(IMAGE_SIZE);
-  conv->set_img_size_y(IMAGE_SIZE_Y);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /* caffeMode */ true));
-
-  if (isDeconv) {
-    conv->set_filter_channels(NUM_FILTERS / conv->groups());
-    config.inputDefs.push_back({INPUT_DATA,
-                                "layer_0",
-                                conv->output_x() * conv->output_y() * CHANNELS,
-                                0});
-    config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS);
-  } else {
-    conv->set_filter_channels(conv->channels() / conv->groups());
-    config.inputDefs.push_back(
-        {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
-    config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                                NUM_FILTERS);
-  }
-
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_1",
-       FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
-}
-
-TEST(Operator, conv) {
-  testConvOperator(/*isDeconv*/ true);
-  testConvOperator(/*isDeconv*/ false);
-}
-
-TEST(Layer, FeatureMapExpandLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("featmap_expand");
-  const int CHANNELS = 10;
-  const int INPUT_SIZE = 100;
-  config.layerConfig.set_size(INPUT_SIZE * CHANNELS);
-  config.layerConfig.set_num_filters(CHANNELS);
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
-                              "layer_0",
-                              /* dim= */ INPUT_SIZE,
-                              /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    for (auto asRowVec : {false, true}) {
-      config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec");
-      testLayerGrad(config,
-                    "featmap_expand",
-                    /*batch_size*/ 100,
-                    /* trans= */ false,
-                    useGpu,
-                    /* useWeight */ true);
-    }
-  }
-}
-
-TEST(Layer, MultiplexLayer) {
-  TestConfig config;
-  const int LAYER_SIZE = 100;
-  config.layerConfig.set_type("multiplex");
-  config.layerConfig.set_size(LAYER_SIZE);
-
-  config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, PadLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("pad");
-
-  int c = 4;
-  int h = 31;
-  int w = 36;
-  size_t size = c * h * w;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PadConfig* pad = input->mutable_pad_conf();
-  ImageConfig* image = pad->mutable_image_conf();
-
-  image->set_channels(c);
-  image->set_img_size(h);
-  image->set_img_size_y(w);
-  pad->add_pad_c(1);
-  pad->add_pad_c(2);
-  pad->add_pad_h(2);
-  pad->add_pad_h(3);
-  pad->add_pad_w(3);
-  pad->add_pad_w(5);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "pad", 10, false, useGpu);
-  }
-}
-
-TEST(Layer, CrossChannelNormLayer) {
-  TestConfig config;
-  config.paramInitialMean = 1.;
-  config.paramInitialStd = 0.;
-  config.layerConfig.set_type("norm");
-  config.layerConfig.set_size(100);
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type("cross-channel-norm");
-  norm->set_channels(10);
-  norm->set_size(100);
-  norm->set_scale(0);
-  norm->set_pow(0);
-  norm->set_blocked(0);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
-  }
-}
-
-TEST(Layer, smooth_l1) {
-  TestConfig config;
-  config.layerConfig.set_type("smooth_l1");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, multibox_loss) {
-  TestConfig config;
-  config.layerConfig.set_type("multibox_loss");
-  config.biasSize = 0;
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf();
-  multiboxLoss->set_num_classes(21);
-  multiboxLoss->set_input_num(1);
-  multiboxLoss->set_overlap_threshold(0.5);
-  multiboxLoss->set_neg_pos_ratio(3);
-  multiboxLoss->set_neg_overlap(0.5);
-  multiboxLoss->set_background_id(0);
-  multiboxLoss->set_height(3);
-  multiboxLoss->set_width(3);
-
-  size_t gtNum = 1;
-  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
-  labelValue->randomizeUniform();
-  labelValue->add(-0.5);
-  labelValue->sigmoid(*labelValue);
-  real* labelData = labelValue->getData();
-  size_t labelWidth = labelValue->getWidth();
-  for (size_t i = 0; i < gtNum; ++i) {
-    *(labelData + i * labelWidth) = std::rand() % 20 + 1;
-    *(labelData + i * labelWidth + 1) = 0.400259;
-    *(labelData + i * labelWidth + 2) = 0.377857;
-    *(labelData + i * labelWidth + 3) = 0.525712;
-    *(labelData + i * labelWidth + 4) = 0.519368;
-  }
-  vector<int> seqStartPositions(gtNum + 1, 0);
-  for (size_t i = 1; i <= gtNum; ++i) {
-    seqStartPositions[i] = i;
-  }
-
-  // Ensure at least one matched bbox
-  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
-  priorValue->randomizeUniform();
-  priorValue->add(-0.5);
-  priorValue->sigmoid(*priorValue);
-  real* priorData = priorValue->getData();
-  *(priorData) = 0.424811;
-  *(priorData + 1) = 0.397059;
-  *(priorData + 2) = 0.538905;
-  *(priorData + 3) = 0.447091;
-  *(priorData + 4) = 0.425720;
-  *(priorData + 5) = 0.515228;
-  *(priorData + 6) = 0.519452;
-  *(priorData + 7) = 0.591065;
-
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}});
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
-  config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
-  config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "multibox_loss", 1, false, useGpu, false);
-  }
-}
-
-TEST(Layer, TransLayer) {
-  TestConfig config;
-  const int height = 128;
-  const int width = 256;
-  config.layerConfig.set_type("trans");
-  config.layerConfig.set_size(width);
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ height * width, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "trans", height, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, RowConvLayer) {
-  const int context = 3;
-  const int size = 512;
-
-  TestConfig config;
-  config.layerConfig.set_type("row_conv");
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  RowConvConfig* conv = input->mutable_row_conv_conf();
-  conv->set_context_length(context);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, CropLayer) {
-  TestConfig config;
-  // config input_0
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ImageConfig* img = input->mutable_image_conf();
-  img->set_channels(4);
-  img->set_img_size(16);
-  config.layerConfig.set_axis(2);
-  config.layerConfig.add_offset(0);
-  config.layerConfig.add_offset(0);
-
-  // config input_1
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0});
-  input = config.layerConfig.add_inputs();
-  img = input->mutable_image_conf();
-  img->set_channels(2);
-  img->set_img_size(8);
-
-  // config crop layer
-  config.layerConfig.set_type("crop");
-  config.layerConfig.set_name("cropLayer");
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "crop", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, roi_pool) {
-  TestConfig config;
-  config.layerConfig.set_type("roi_pool");
-  config.biasSize = 0;
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
-  roiPoolConf->set_pooled_width(7);
-  roiPoolConf->set_pooled_height(7);
-  roiPoolConf->set_spatial_scale(1. / 16);
-  roiPoolConf->set_width(14);
-  roiPoolConf->set_height(14);
-
-  const size_t roiNum = 10;
-  const size_t roiDim = 10;
-  const size_t batchSize = 5;
-  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
-  roiValue->zeroMem();
-  real* roiData = roiValue->getData();
-  for (size_t i = 0; i < roiNum; ++i) {
-    roiData[i * roiDim + 0] = std::rand() % batchSize;
-    roiData[i * roiDim + 1] = std::rand() % 224;  // xMin
-    roiData[i * roiDim + 2] = std::rand() % 224;  // yMin
-    size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
-    size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
-    roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin);  // xMax
-    roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin);  // yMax
-  }
-
-  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, SwitchOrderLayer) {
-  TestConfig config;
-  // config input_0
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ImageConfig* img = input->mutable_image_conf();
-  img->set_channels(4);
-  img->set_img_size(16);
-  img->set_img_size_y(16);
-
-  ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf();
-  reshape->add_height_axis(0);
-  reshape->add_height_axis(1);
-  reshape->add_height_axis(2);
-  reshape->add_width_axis(3);
-
-  // config switch_order layer
-  config.layerConfig.set_type("switch_order");
-  config.layerConfig.set_name("switchOrderLayer");
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "switch_order", 100, false, useGpu, true);
-  }
-}
-
-vector<real> randSampling(real range, int n) {
-  CHECK_GE(range, n);
-  vector<real> num(range);
-  iota(begin(num), end(num), 0.);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  sort(begin(num), end(num));
-  return num;
-}
-
-TEST(Layer, SubNestedSequenceLayer) {
-  // layer size is not crucial for this layer,
-  // so use a small layer size in the unit test
-  const int layerSize = 4;
-
-  const int maxSeqNum = 50;
-  const int maxSeqLen = 50;
-  const int maxBeamSize = 32;
-
-  srand((size_t)(time(NULL)));
-  int beamSize = 1 + (rand() % maxBeamSize);
-
-  TestConfig config;
-  config.layerConfig.set_type("sub_nested_seq");
-  config.layerConfig.set_name("sub_nested_seq_layer");
-  config.layerConfig.set_size(layerSize);
-
-  int seqNum = 1 + (rand() % maxSeqNum);
-
-  // sequence information for the first input; it is a nested sequence
-  vector<int> seqStartPos(seqNum + 1, 0);
-  vector<int> subSeqStartPos(1, 0);
-
-  // selected indices
-  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
-  selectedIndices->one();
-  selectedIndices->mulScalar(-1.);
-  real* indicesData = selectedIndices->getData();
-
-  for (int i = 0; i < seqNum; ++i) {
-    int subSeqNum = 1 + (rand() % maxSeqNum);
-    for (int j = 0; j < subSeqNum; ++j) {
-      subSeqStartPos.push_back(subSeqStartPos.back() +
-                               (1 + (rand() % maxSeqLen)));
-    }
-    vector<real> selSeqs =
-        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
-    memcpy(indicesData + (i * beamSize),
-           selSeqs.data(),
-           selSeqs.size() * sizeof(real));
-    seqStartPos[i + 1] = subSeqStartPos.back();
-  }
-
-  MatrixPtr seqInputPtr =
-      Matrix::create(seqStartPos.back(), layerSize, false, false);
-  seqInputPtr->randomizeUniform();
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                              "nested_seq_input",
-                              seqInputPtr,
-                              seqStartPos,
-                              subSeqStartPos});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "sub_nested_seq",
-                  /* batchSize */ seqNum,
-                  /* trans */ false,
-                  /* useGpu*/ useGpu,
-                  /* useWeight */ false);
-  }
-}
-
-TEST(Layer, ClipLayer) {
-  const size_t batchSize = 128;
-  const size_t size = 512;
-  TestConfig config;
-  config.layerConfig.set_type("clip");
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ClipConfig* layerConf = input->mutable_clip_conf();
-  double p1 = std::rand() / (double)RAND_MAX;
-  double p2 = std::rand() / (double)RAND_MAX;
-  layerConf->set_min(std::min(p1, p2));
-  layerConf->set_max(std::max(p1, p2));
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, RowL2NormLayer) {
-  const size_t batchSize = 128;
-  const size_t size = 512;
-  TestConfig config;
-  config.layerConfig.set_type("row_l2_norm");
-  config.layerConfig.set_size(size);
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
-  }
-}
-
-void test3DConvLayer(const string& type, bool trans, bool useGpu) {
-  // filter size
-  const int NUM_FILTERS = 6;
-  // const int CHANNELS = 3;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-
-  // input image
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 9;
-  const int IMAGE_SIZE_Y = 9;
-  const int IMAGE_SIZE_Z = 9;
-
-  TestConfig config;
-  config.biasSize = NUM_FILTERS;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(NUM_FILTERS);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  // Setting up the conv3d layer
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-
-  conv->set_channels(CHANNELS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_filter_size_z(FILTER_SIZE_Z);
-  conv->set_padding(0);
-  conv->set_padding_y(0);
-  conv->set_padding_z(0);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_stride_z(2);
-  conv->set_img_size(IMAGE_SIZE);
-  conv->set_img_size_y(IMAGE_SIZE_Y);
-  conv->set_img_size_z(IMAGE_SIZE_Z);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /* caffeMode */ true));
-  conv->set_output_z(outputSize(conv->img_size_z(),
-                                conv->filter_size_z(),
-                                conv->padding_z(),
-                                conv->stride_z(),
-                                /* caffeMode */ true));
-
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              conv->output_z() * NUM_FILTERS);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
-       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
-           NUM_FILTERS});
-
-  testLayerGrad(config, "conv3D", 10, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, test3DConvLayer) {
-  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
-  // filter size
-  const int NUM_FILTERS = 6;
-  // const int CHANNELS = 3;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-
-  // input image
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 4;
-  const int IMAGE_SIZE_Y = 6;
-  const int IMAGE_SIZE_Z = 6;
-
-  // Setting up the conv-trans layer
-  TestConfig config;
-  config.biasSize = NUM_FILTERS;
-  config.layerConfig.set_type("deconv3d");
-  config.layerConfig.set_num_filters(NUM_FILTERS);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-
-  conv->set_channels(CHANNELS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_filter_size_z(FILTER_SIZE_Z);
-  conv->set_padding(0);
-  conv->set_padding_y(0);
-  conv->set_padding_z(0);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_stride_z(2);
-  conv->set_output_x(IMAGE_SIZE);
-  conv->set_output_y(IMAGE_SIZE_Y);
-  conv->set_output_z(IMAGE_SIZE_Z);
-
-  conv->set_img_size(imageSize(conv->output_x(),
-                               conv->filter_size(),
-                               conv->padding(),
-                               conv->stride(),
-                               true));
-  conv->set_img_size_y(imageSize(conv->output_y(),
-                                 conv->filter_size_y(),
-                                 conv->padding_y(),
-                                 conv->stride_y(),
-                                 true));
-  conv->set_img_size_z(imageSize(conv->output_z(),
-                                 conv->filter_size_z(),
-                                 conv->padding_z(),
-                                 conv->stride_z(),
-                                 true));
-  config.layerConfig.set_size(conv->img_size() * conv->img_size_y() *
-                              conv->img_size_z() * NUM_FILTERS);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
-       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
-           NUM_FILTERS});
-
-  testLayerGrad(config, "deconv3D", 10, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, test3DDeConvLayer) {
-  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-TEST(Layer, ScaleShiftLayer) {
-  // FIXME: Disable ScaleShiftLayer because it is not stable.
-  // https://github.com/PaddlePaddle/Paddle/issues/7781
-  return;
-  // const size_t batchSize = 16;
-  // const size_t size = 32;
-  // TestConfig config;
-  // config.layerConfig.set_type("scale_shift");
-  // config.layerConfig.set_size(size);
-  // config.biasSize = 1;
-  // config.inputDefs.push_back(
-  //     {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
-  // config.layerConfig.add_inputs();
-  // for (auto useGpu : {false, true}) {
-  //   testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
-  // }
-}
-
-TEST(Layer, ScaleSubRegionLayer) {
-  const size_t batchSize = 64;
-  const size_t size = 4096;
-  TestConfig config;
-  config.layerConfig.set_type("scale_sub_region");
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
-  auto* data = indicesV->getData();
-  // each row holds six indices: [cStart, cEnd, hStart, hEnd, wStart, wEnd]
-  for (size_t i = 0; i < batchSize; ++i) {
-    data[i * 6] = 2;
-    data[i * 6 + 1] = 4;
-    data[i * 6 + 2] = 16;
-    data[i * 6 + 3] = 32;
-    data[i * 6 + 4] = 16;
-    data[i * 6 + 5] = 32;
-  }
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ScaleSubRegionConfig* scaleSubRegionConf =
-      input->mutable_scale_sub_region_conf();
-  ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
-  imgConf->set_img_size(32);
-  imgConf->set_img_size_y(32);
-  imgConf->set_channels(4);
-  scaleSubRegionConf->set_value(2.0);
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, L2DistanceLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("l2_distance");
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-
-  const size_t input_dim = 27;
-  const size_t batch_size = 11;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
-  }
-}
-
-void testFactorizationMachineLayer(InputType type, bool useGpu) {
-  const int FACTOR_SIZE = 10;
-  TestConfig config;
-  config.layerConfig.set_type("factorization_machine");
-  config.layerConfig.set_factor_size(FACTOR_SIZE);
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-  config.inputDefs.push_back({type, "layer_0", 128, 1280});
-  config.layerConfig.add_inputs();
-  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
-}
-
-TEST(Layer, FactorizationMachineLayer) {
-  for (auto useGpu : {false, true}) {
-    testFactorizationMachineLayer(INPUT_DATA, useGpu);
-  }
-  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
false); -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp deleted file mode 100644 index 423c31e27d7ca223f1cbff8f030b006d3889f0bb..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_LinearChainCRF.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/gserver/layers/LinearChainCRF.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -static inline bool getNextSequence(vector& seq, int numClasses) { - for (auto& v : seq) { - if (++v < numClasses) { - return true; - } - v = 0; - } - return false; -} - -TEST(LinearChainCRF, decoding) { - const int numClasses = 4; - CpuVector para(numClasses * (numClasses + 2)); - real* a = para.getData(); - real* b = para.getData() + numClasses; - real* w = para.getData() + 2 * numClasses; - LinearChainCRF crf(4, para.getData()); - for (int length : {1, 2, 3, 10}) { - for (int tries = 0; tries < 10; ++tries) { - CpuMatrix x(length, numClasses); - x.randomizeUniform(); - para.randnorm(0, 2); - vector decodingResult(length); - vector bestResult(length); - vector testResult(length, 0); - crf.decode(x.getData(), &decodingResult[0], length); - real bestScore = -std::numeric_limits::max(); - do { - real score = a[testResult.front()] + b[testResult.back()]; - score += x.getElement(0, testResult.front()); - for (int k = 1; k < length; ++k) { - score += x.getElement(k, testResult[k]) + - w[numClasses * testResult[k - 1] + testResult[k]]; - } - if (score > bestScore) { - bestScore = score; - bestResult = testResult; - } - } while (getNextSequence(testResult, numClasses)); - for (int k = 0; k < length; ++k) { - EXPECT_EQ(decodingResult[k], bestResult[k]); - } - } - } -} diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp deleted file mode 100644 index a34a3f6206171fb1e0563ab9ef8550bc890359ce..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ /dev/null @@ -1,448 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include "MKLDNNTester.h" -#include "ModelConfig.pb.h" -#include "paddle/gserver/activations/MKLDNNActivation.h" -#include "paddle/math/MathUtils.h" - -using namespace paddle; // NOLINT - -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_bool(use_gpu); -DECLARE_bool(use_mkldnn); - -#define RUN_MKLDNN_TEST(DNN_CONFIG, REF_CONFIG, DESC) \ - MKLDNNTester tester; \ - for (auto bs : {DESC.bs, 1}) { \ - tester.run(DNN_CONFIG, REF_CONFIG, bs, DESC.ih, DESC.iw); \ - } - -#define RUN_MKLDNN_TEST_LAYER(DNN_CONFIG, REF_TYPE, DESC) \ - TestConfig ref = DNN_CONFIG; \ - ref.layerConfig.set_type(REF_TYPE); \ - RUN_MKLDNN_TEST(DNN_CONFIG, ref, DESC) - -struct testFcDesc { - int bs; - int ic; - int ih, iw; // oh == ow == 1 - int oc; -}; - -static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) { - cfg.layerConfig.set_type("mkldnn_fc"); - cfg.layerConfig.set_active_type("relu"); - cfg.layerConfig.set_size(pm.oc); - cfg.inputDefs.push_back( - {INPUT_DATA, - "layer_0", - /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), - /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)}); - cfg.layerConfig.add_inputs(); -} - -void testFcLayer(const testFcDesc& pm) { - TestConfig dnnConfig; - getMKLDNNFcConfig(dnnConfig, pm); - for (auto biasSize : {pm.oc, 0}) { - dnnConfig.biasSize = biasSize; - RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm) - } -} - -TEST(MKLDNNLayer, FcLayer) { - /* bs, ic, ih, iw, oc */ - testFcLayer({2, 2, 1, 1, 3}); - testFcLayer({3, 7, 1, 1, 19}); - testFcLayer({8, 16, 13, 13, 32}); - testFcLayer({4, 12, 13, 13, 18}); - testFcLayer({2, 64, 16, 16, 32}); - testFcLayer({15, 3, 16, 16, 6}); -} - -struct testConvDesc { - int bs, gp; - int ic, ih, iw; - int oc, oh, ow; - int fh, fw; - int ph, pw; - int sh, sw; - int dh, dw; -}; - -static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) { - cfg.layerConfig.set_type("mkldnn_conv"); - cfg.layerConfig.set_active_type("relu"); - cfg.layerConfig.set_num_filters(pm.oc); - cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow); - cfg.layerConfig.set_shared_biases(true); - cfg.inputDefs.push_back( - {INPUT_DATA, - "layer_0", - /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), - /* size of weight= */ size_t(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp)}); - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_groups(pm.gp); - conv->set_img_size(pm.iw); - conv->set_img_size_y(pm.ih); - conv->set_output_x(pm.ow); - conv->set_output_y(pm.oh); - conv->set_filter_size(pm.fw); - conv->set_filter_size_y(pm.fh); - conv->set_channels(pm.ic); - conv->set_padding(pm.pw); - conv->set_padding_y(pm.ph); - conv->set_stride(pm.sw); - conv->set_stride_y(pm.sh); - conv->set_dilation(pm.dw); - conv->set_dilation_y(pm.dh); - conv->set_caffe_mode(true); - conv->set_filter_channels(conv->channels() / conv->groups()); - CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels()) - << "it is indivisible"; - - int fh = (pm.fh - 1) * pm.dh + 1; - int fw = (pm.fw - 1) * pm.dw + 1; - int ow = outputSize(pm.iw, fw, pm.pw, pm.sw, true); - int oh = outputSize(pm.ih, fh, pm.ph, pm.sh, true); - CHECK_EQ(ow, pm.ow) << "output size check failed"; - CHECK_EQ(oh, pm.oh) << "output size check failed"; -} - -void testConvLayer(const testConvDesc& pm) { - TestConfig dnnConfig; - getMKLDNNConvConfig(dnnConfig, pm); - for (auto biasSize : {pm.oc, 0}) { - dnnConfig.biasSize = biasSize; - RUN_MKLDNN_TEST_LAYER(dnnConfig, "exconv", pm) - } -} - 
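getMKLDNNConvConfig() above double-checks its descriptor with the standard convolution shape arithmetic: a dilation d inflates a filter of size f to an effective (f - 1) * d + 1, and with caffeMode semantics the output extent is (i - f_eff + 2 * p) / s + 1. Below is a minimal self-contained sketch of that same check, mirroring one of the cases exercised in the test that follows; the helper names are illustrative, not Paddle APIs.

#include <cassert>

// Effective extent of a dilated filter; d == 1 leaves f unchanged.
static int effectiveFilter(int f, int d) { return (f - 1) * d + 1; }

// caffeMode convolution output size: (i - f + 2 * p) / s + 1.
static int convOutput(int i, int f, int p, int s) {
  return (i - f + 2 * p) / s + 1;
}

int main() {
  // Mirrors testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1}):
  // ih = 42, iw = 31, fh = 4, fw = 5, ph = 3, pw = 2, sh = 2, sw = 3, dh = dw = 1.
  assert(convOutput(42, effectiveFilter(4, 1), 3, 2) == 23);  // oh
  assert(convOutput(31, effectiveFilter(5, 1), 2, 3) == 11);  // ow
  return 0;
}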
-TEST(MKLDNNLayer, ConvLayer) { - /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */ - testConvLayer({2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1}); - testConvLayer({2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1}); - testConvLayer({3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1}); - // with groups - testConvLayer({2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1}); -} - -struct testPoolDesc { - int bs, ic; // input channel and output channel are the same - int ih, iw; - int oh, ow; - int fh, fw; - int ph, pw; - int sh, sw; -}; - -static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) { - cfg.layerConfig.set_type("mkldnn_pool"); - cfg.layerConfig.set_active_type("relu"); - cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow); - cfg.inputDefs.push_back( - {INPUT_DATA, - "layer_0", - /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), - 0}); - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - PoolConfig* pool = input->mutable_pool_conf(); - pool->set_pool_type("avg-projection"); - pool->set_channels(pm.ic); - pool->set_img_size(pm.iw); - pool->set_img_size_y(pm.ih); - pool->set_output_x(pm.ow); - pool->set_output_y(pm.oh); - pool->set_size_x(pm.fw); - pool->set_size_y(pm.fh); - pool->set_padding(pm.pw); - pool->set_padding_y(pm.ph); - pool->set_stride(pm.sw); - pool->set_stride_y(pm.sh); - - int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false); - int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false); - CHECK_EQ(ow, pm.ow) << "output size check failed"; - CHECK_EQ(oh, pm.oh) << "output size check failed"; -} - -void testPoolLayer(const testPoolDesc& pm) { - TestConfig dnnConfig; - getMKLDNNPoolConfig(dnnConfig, pm); - LayerInputConfig* input = dnnConfig.layerConfig.mutable_inputs(0); - PoolConfig* pool = input->mutable_pool_conf(); - for (auto type : {"max-projection", "avg-projection"}) { - pool->set_pool_type(type); - RUN_MKLDNN_TEST_LAYER(dnnConfig, "pool", pm) - } -} - -TEST(MKLDNNLayer, PoolLayer) { - /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw */ - testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2}); - testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2}); - testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2}); - testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2}); - testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2}); - testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1}); - testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1}); - testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2}); -} - -struct testBatchNormDesc { - int bs; - int ic; - int ih, iw; -}; - -static void getMKLDNNBatchNormConfig(TestConfig& cfg, - const testBatchNormDesc& pm) { - cfg.layerConfig.set_size(pm.ic * pm.ih * pm.iw); - cfg.layerConfig.set_type("mkldnn_batch_norm"); - cfg.biasSize = pm.ic; - cfg.inputDefs.push_back( - {INPUT_DATA, - "layer_0", - /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), - /* size of weight= */ size_t(pm.ic)}); - cfg.inputDefs.push_back( - {INPUT_DATA, "layer_1_moving_mean", 1, size_t(pm.ic)}); - cfg.inputDefs.back().isStatic = true; - cfg.inputDefs.push_back({INPUT_DATA, 
"layer_2_moving_var", 1, size_t(pm.ic)}); - cfg.inputDefs.back().isStatic = true; - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - cfg.layerConfig.set_active_type("relu"); - cfg.layerConfig.add_inputs(); - cfg.layerConfig.add_inputs(); - ImageConfig* img_conf = input->mutable_image_conf(); - img_conf->set_channels(pm.ic); - img_conf->set_img_size_y(pm.ih); - img_conf->set_img_size(pm.iw); -} - -void testBatchNormLayer(const testBatchNormDesc& pm) { - TestConfig dnnConfig; - getMKLDNNBatchNormConfig(dnnConfig, pm); - TestConfig refConfig = dnnConfig; - refConfig.layerConfig.set_type("batch_norm"); - // for PASS_TRAIN, use_global_stats always should be false, and batchsize != 1 - VLOG(MKLDNN_TESTS) << "check train phase"; - dnnConfig.layerConfig.set_use_global_stats(false); - refConfig.layerConfig.set_use_global_stats(false); - MKLDNNTester tester; - tester.run(dnnConfig, refConfig, pm.bs, pm.ih, pm.iw, PASS_TRAIN); - // for PASS_TEST, check use_global_stats true and false, and batchsize 1 - VLOG(MKLDNN_TESTS) << "check test phase"; - for (auto useGS : {false, true}) { - dnnConfig.layerConfig.set_use_global_stats(useGS); - refConfig.layerConfig.set_use_global_stats(useGS); - MKLDNNTester tester; - for (auto bs : {pm.bs, 1}) { - tester.run(dnnConfig, refConfig, bs, pm.ih, pm.iw, PASS_TEST); - } - } -} - -TEST(MKLDNNLayer, BatchNormLayer) { - testBatchNormLayer({4, 10, 6, 6}); - testBatchNormLayer({16, 32, 16, 16}); - testBatchNormLayer({4, 16, 8, 10}); -} - -struct testLRNDesc { - int bs, ic, ih, iw; - float scale, pow; - int localSize; -}; - -void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) { - cfg.layerConfig.set_type("mkldnn_lrn"); - cfg.layerConfig.set_active_type("relu"); - size_t layerSize = pm.ic * pm.ih * pm.iw; - cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0}); - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - NormConfig* norm = input->mutable_norm_conf(); - norm->set_channels(pm.ic); - norm->set_size(pm.localSize); - norm->set_scale(pm.scale); - norm->set_pow(pm.pow); - norm->set_blocked(0); - norm->set_img_size(pm.iw); - norm->set_img_size_y(pm.ih); - norm->set_output_x(norm->img_size()); - norm->set_output_y(norm->img_size_y()); - cfg.layerConfig.set_size(layerSize); - cfg.biasSize = 0; -} - -void testLRNLayer(const testLRNDesc& pm) { - TestConfig dnnConfig; - getMKLDNNLRNConfig(dnnConfig, pm); - // mkldnn_lrn <==> norm with cmrnorm-projection type - TestConfig refConfig = dnnConfig; - refConfig.layerConfig.set_type("norm"); - LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0); - NormConfig* norm = input->mutable_norm_conf(); - norm->set_norm_type("cmrnorm-projection"); - norm->set_scale(norm->scale() / norm->size()); - RUN_MKLDNN_TEST(dnnConfig, refConfig, pm) -} - -TEST(MKLDNNLayer, LRNLayer) { - testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5}); - testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5}); - testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5}); -} - -struct testImageDesc { - int bs, ic, ih, iw; -}; - -static void getAddtoConfig(TestConfig& cfg, - const testImageDesc& pm, - const size_t nInputs = 1) { - cfg.biasSize = 0; - cfg.layerConfig.set_type("addto"); - size_t layerSize = pm.ic * pm.ih * pm.iw; - cfg.layerConfig.set_size(layerSize); - cfg.layerConfig.set_active_type("relu"); - for (size_t i = 0; i < nInputs; ++i) { - std::stringstream ss; - ss << "layer_" << i; - cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0}); - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - ImageConfig* img_conf = 
input->mutable_image_conf(); - img_conf->set_channels(pm.ic); - img_conf->set_img_size_y(pm.ih); - img_conf->set_img_size(pm.iw); - } -} - -void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) { - CHECK_GE(nInputs, 1UL); - TestConfig dnnConfig; - getAddtoConfig(dnnConfig, pm, nInputs); - dnnConfig.layerConfig.set_type("mkldnn_addto"); - for (auto withBias : {false, true}) { - dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0; - RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm) - } -} - -TEST(MKLDNNLayer, AddtoLayer) { - testAddtoLayer({16, 5, 14, 14}, 1); - testAddtoLayer({8, 10, 8, 8}, 2); - testAddtoLayer({4, 12, 1, 1}, 3); -} - -static void getMKLDNNConcatConfig(TestConfig& cfg, - const std::vector& inputs) { - CHECK_GE(inputs.size(), 2UL) << "at least two inputs"; - int oc = inputs[0].ic; - for (size_t i = 1; i < inputs.size(); ++i) { - CHECK_EQ(inputs[i].bs, inputs[0].bs); - CHECK_EQ(inputs[i].ih, inputs[0].ih); - CHECK_EQ(inputs[i].iw, inputs[0].iw); - oc += inputs[i].ic; - } - cfg.biasSize = 0; - cfg.layerConfig.set_type("mkldnn_concat"); - cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw); - cfg.layerConfig.set_active_type("relu"); - for (size_t i = 0; i < inputs.size(); ++i) { - std::stringstream ss; - ss << "layer_" << i; - cfg.inputDefs.push_back( - {INPUT_DATA, - ss.str(), - (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw, - 0}); - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - ImageConfig* img_conf = input->mutable_image_conf(); - img_conf->set_channels(inputs[i].ic); - img_conf->set_img_size_y(inputs[i].ih); - img_conf->set_img_size(inputs[i].iw); - } -} - -void testConcatLayer(const std::vector& inputs) { - TestConfig dnnConfig; - getMKLDNNConcatConfig(dnnConfig, inputs); - RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0]) -} - -TEST(MKLDNNLayer, ConcatLayer) { - testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}}); - testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}}); -} - -void testActivation(std::string actType, const testImageDesc& pm) { - // TODO(TJ): remove me when paddle support elu activation - if (actType == "mkldnn_elu") { - return; - } - const std::string compareTypes[] = {actType, actType.erase(0, 7)}; - TestConfig cfg; - getAddtoConfig(cfg, pm); - TestConfig ref = cfg; - cfg.layerConfig.set_active_type(compareTypes[0]); - ref.layerConfig.set_active_type(compareTypes[1]); - RUN_MKLDNN_TEST(cfg, ref, pm) -} - -TEST(MKLDNNActivation, Activations) { - auto types = MKLDNNActivation::getAllRegisteredTypes(); - for (auto type : types) { - /* bs, c, h, w*/ - testActivation(type, {16, 64, 32, 32}); - testActivation(type, {2, 8, 1, 1}); - } -} - -DECLARE_string(config_args); -TEST(MKLDNNNet, net) { - std::vector cases = {"simple", "branch"}; - for (auto name : cases) { - std::string config = "./gserver/tests/mkldnn_" + name + "_net.conf"; - for (auto channels : {2, 32}) { - std::ostringstream oss; - oss << "channels=" << channels; - FLAGS_config_args = oss.str(); - MKLDNNTester::runNetTest(config); - } - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - FLAGS_use_gpu = false; - FLAGS_use_mkldnn = true; - initMain(argc, argv); - initPython(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp deleted file mode 100644 index 5188d2abed899a210de66084109034ee381cd078..0000000000000000000000000000000000000000 --- 
a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "LayerGradUtil.h" -#include "paddle/math/MathUtils.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; - -void setPoolConfig(TestConfig* config, - PoolConfig* pool, - const string& poolType) { - (*config).biasSize = 0; - (*config).layerConfig.set_type("pool"); - (*config).layerConfig.set_num_filters(1); - - int kw = 3, kh = 3; - int pw = 0, ph = 0; - int sw = 2, sh = 2; - pool->set_pool_type(poolType); - pool->set_channels(1); - pool->set_size_x(kw); - pool->set_size_y(kh); - pool->set_start(0); - pool->set_padding(pw); - pool->set_padding_y(ph); - pool->set_stride(sw); - pool->set_stride_y(sh); - - int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); - int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); - pool->set_output_x(ow); - pool->set_output_y(oh); -} - -void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat, - const string& poolType, - bool use_gpu, - MatrixPtr& maskMat) { - TestConfig config; - config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - PoolConfig* pool = input->mutable_pool_conf(); - - pool->set_img_size(5); - pool->set_img_size_y(5); - setPoolConfig(&config, pool, poolType); - config.layerConfig.set_size(pool->output_x() * pool->output_y() * - pool->channels()); - - config.layerConfig.set_name("MaxPoolWithMask"); - - std::vector dataLayers; - LayerMap layerMap; - vector datas; - - initDataLayer(config, - &dataLayers, - &datas, - &layerMap, - "MaxPoolWithMask", - 1, - false, - use_gpu); - - dataLayers[0]->getOutputValue()->copyFrom(*inputMat); - - FLAGS_use_gpu = use_gpu; - std::vector parameters; - LayerPtr maxPoolingWithMaskOutputLayer; - initTestLayer(config, &layerMap, ¶meters, &maxPoolingWithMaskOutputLayer); - maxPoolingWithMaskOutputLayer->forward(PASS_GC); - - checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value, - maskMat); -} - -TEST(Layer, maxPoolingWithMaskOutputLayerFwd) { - bool useGpu = false; - MatrixPtr inputMat; - MatrixPtr maskMat; - real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1, - 0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8, - 0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0}; - real maskData[] = {12, 4, 22, 24}; - - inputMat = Matrix::create(1, 25, false, useGpu); - maskMat = Matrix::create(1, 4, false, useGpu); - inputMat->setData(inputData); - maskMat->setData(maskData); - doOneMaxPoolingWithMaskOutputTest( - inputMat, "max-pool-with-mask", useGpu, maskMat); -#ifdef PADDLE_WITH_CUDA - useGpu = true; - inputMat = Matrix::create(1, 25, false, useGpu); - maskMat = Matrix::create(1, 4, false, useGpu); - inputMat->copyFrom(inputData, 25); - maskMat->copyFrom(maskData, 4); - doOneMaxPoolingWithMaskOutputTest( - inputMat, "max-pool-with-mask", useGpu, maskMat); -#endif -} diff 
--git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp deleted file mode 100644 index 043025239e744601cbef3ca5c241509872963bd8..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_MultinomialSampler.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include -#include - -#undef PADDLE_DISABLE_TIMER -#include "paddle/utils/Stat.h" - -#include "paddle/gserver/layers/MultinomialSampler.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -class MultinomialSamplerTester : public MultinomialSampler { - public: - MultinomialSamplerTester(real* prob, int size) - : MultinomialSampler(prob, size) {} - - template - int testGen(Rand1 rand1) { - return gen1(rand1); - } -}; - -TEST(MultinomialSampler, gen) { - int numGrids = 1024 * 1024; - int size = 1024 * 4; - default_random_engine reng; - - for (size_t iter = 0; iter < 256; ++iter) { - uniform_int_distribution rand(1, numGrids / size * 1.8); - vector prob; - int sum = 0; - for (int i = 0; i < size; ++i) { - prob.push_back(rand(reng)); - sum += prob.back(); - } - - CHECK_LE(sum, numGrids); - prob.back() += numGrids - sum; - - vector counts(size); - MultinomialSamplerTester sampler(&prob[0], size); - counts.assign(size, 0); - { - double s = (double)size / (double)numGrids; - REGISTER_TIMER("MultinomialSampler"); - for (double i = 0; i < numGrids; ++i) { - int ret = sampler.testGen([i, s]() { return s * i; }); - if (ret < 0 || ret >= size) { - EXPECT_GE(ret, 0); - EXPECT_LT(ret, size); - break; - } - ++counts[ret]; - } - } - for (int i = 0; i < size; ++i) { - if (prob[i] != counts[i]) { - EXPECT_EQ(prob[i], counts[i]); - LOG(INFO) << iter; - break; - } - } - } -} - -void benchmarkRandom() { - int n = 1024 * 1024; - - int sum; - double sum1; - - sum = 0; - unsigned int seed = 1; - { - REGISTER_TIMER("crand"); - for (int i = 0; i < n; ++i) { - sum += rand_r(&seed) % 1000; - } - } - LOG(INFO) << "sum=" << sum; - - default_random_engine reng; - uniform_int_distribution rand(1, 1000); - sum = 0; - { - REGISTER_TIMER("stdrand"); - for (int i = 0; i < n; ++i) { - sum += rand(reng); - } - } - LOG(INFO) << "sum=" << sum; - - sum = 0; - { - REGISTER_TIMER("default_random_engine"); - for (int i = 0; i < n; ++i) { - sum += reng(); - } - } - LOG(INFO) << "sum=" << sum; - - uniform_real_distribution rand1(0, 1); - sum1 = 0; - { - REGISTER_TIMER("stdrand1"); - for (int i = 0; i < n; ++i) { - sum1 += rand1(reng); - } - } - LOG(INFO) << "sum1=" << sum1; - - sum1 = 0; - { - real a = 1.0f / (real)RAND_MAX; - REGISTER_TIMER("crand1"); - for (int i = 0; i < n; ++i) { - sum1 += a * rand_r(&seed); - } - } - LOG(INFO) << "sum1=" << sum1; -} - -int main(int argc, char** argv) { - initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - benchmarkRandom(); - int ret = RUN_ALL_TESTS(); - globalStat.printSegTimerStatus(); - return ret; -} diff --git 
a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp deleted file mode 100644 index fda3f2f7934adde09303f443ca5e8de6a7d077cd..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_NetworkCompare.cpp +++ /dev/null @@ -1,294 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#undef PADDLE_DISABLE_TIMER -#include -#include -#include -#include - -#include "paddle/testing/TestUtil.h" -#include "paddle/trainer/Trainer.h" -#include "paddle/utils/Stat.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_int32(gpu_id); -DECLARE_double(checkgrad_eps); -DEFINE_bool(use_label, true, "input label or sequence label"); -DEFINE_bool(static_para, false, "static parameter"); - -struct DataIn { - std::vector inArgs; - std::vector outGrads; - std::vector paraValues; -}; - -struct DataOut { - std::vector outValues; - std::vector paraGrads; -}; - -void initArgument(DataIn& data, - const std::string& configPath, - bool useGpu = FLAGS_use_gpu) { - TrainerConfigHelper config(configPath); - size_t batchSize = config.getOptConfig().batch_size(); - - for (const auto& layer_name : config.getModelConfig().input_layer_names()) { - auto layer_config = std::find_if(config.getModelConfig().layers().begin(), - config.getModelConfig().layers().end(), - [=](const LayerConfig& layer_config) { - return layer_config.name() == layer_name; - }); - CHECK(layer_config != config.getModelConfig().layers().end()); - - size_t layerSize = layer_config->size(); - Argument arg; - arg.value = Matrix::create(batchSize, layerSize, false, useGpu); - arg.grad = Matrix::create(batchSize, layerSize, false, useGpu); - arg.value->randomizeUniform(); - arg.value->add(-0.5); - arg.value->sigmoid(*arg.value); - arg.grad->zeroMem(); - if (FLAGS_use_label) { - arg.ids = VectorT::create(batchSize, useGpu); - arg.ids->rand(layerSize); - } - generateSequenceStartPositions(batchSize, arg.sequenceStartPositions); - data.inArgs.push_back(arg); - } - - for (const auto& layer_name : config.getModelConfig().output_layer_names()) { - auto layer_config = std::find_if(config.getModelConfig().layers().begin(), - config.getModelConfig().layers().end(), - [=](const LayerConfig& layer_config) { - return layer_config.name() == layer_name; - }); - CHECK(layer_config != config.getModelConfig().layers().end()); - - size_t layerSize = layer_config->size(); - MatrixPtr grad = Matrix::create(batchSize, layerSize, false, useGpu); - grad->randomizeUniform(); - data.outGrads.push_back(grad); - } - - for (const auto& para_config : config.getModelConfig().parameters()) { - VectorPtr value = Vector::create(para_config.size(), useGpu); - value->randnorm(0, 2); - data.paraValues.push_back(value); - } -} - -void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) { - *ThreadLocalRand::getSeed() = 0; - srand(0); - - Trainer trainer; - auto config = std::make_shared(configPath); - trainer.init(config, false); - - std::vector 
parameters; - vector outArgs; - - auto gradientMachine = trainer.getGradientMachine(); - parameters = gradientMachine->getParameters(); - if (FLAGS_static_para) { - for (size_t i = 0; i < parameters.size(); i++) { - parameters[i]->getBuf(PARAMETER_VALUE)->one(); - } - } else { - for (size_t i = 0; i < in.paraValues.size(); i++) { - parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]); - } - } - gradientMachine->start(); - gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN); - for (size_t i = 0; i < in.outGrads.size(); i++) { - // If the all the layers in the config have no parameters, also - // not set NeedGradient(), the outArgs[i] will be nullptr. - outArgs[i].grad->copyFrom(*in.outGrads[i]); - } - gradientMachine->backward(); - for (size_t i = 0; i < in.outGrads.size(); i++) { - MatrixPtr value = Matrix::create(outArgs[i].value->getHeight(), - outArgs[i].value->getWidth(), - false, - false); - value->copyFrom(*outArgs[i].value); - out.outValues.push_back(value); - } - for (size_t i = 0; i < in.paraValues.size(); i++) { - VectorPtr grad = Vector::create( - parameters[i]->getBuf(PARAMETER_GRADIENT)->getSize(), false); - grad->copyFrom(*parameters[i]->getBuf(PARAMETER_GRADIENT)); - out.paraGrads.push_back(grad); - } - - for (int i = 0; i < 20; i++) { - REGISTER_TIMER("forward"); - gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN); - } - for (int i = 0; i < 20; i++) { - REGISTER_TIMER("backward"); - gradientMachine->backward(); - } - - gradientMachine->finish(); -} - -void checkBuffer(real* A, - const char* desA, - real* B, - const char* desB, - size_t len, - size_t width = 1) { - int nNum = 0; - for (size_t i = 0; i < len; ++i) { - real diff = fabs(A[i] - B[i]); - if (diff > 0.0f && - diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_checkgrad_eps) { - nNum++; - LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i] - << " " << desB << " : " << B[i]; - } - } - EXPECT_EQ(0, nNum); -} - -void compareGradient(DataOut& outA, DataOut& outB) { - LOG(INFO) << "------------------------------" - << " Check Network Output " - << "------------------------------"; - for (size_t i = 0; i < outA.outValues.size(); ++i) { - LOG(INFO) << "OUTPUT VALUE: " << i; - checkBuffer(outA.outValues[i]->getData(), - "network A output", - outB.outValues[i]->getData(), - "network B output", - outA.outValues[i]->getElementCnt(), - outA.outValues[i]->getWidth()); - } - - if (!FLAGS_static_para) { - LOG(INFO) << "------------------------------" - << " Check Parameters " - << "------------------------------"; - for (size_t i = 0; i < outA.paraGrads.size(); ++i) { - LOG(INFO) << "PARAMETER GRADIENT: " << i; - checkBuffer(outA.paraGrads[i]->getData(), - "Network A", - outB.paraGrads[i]->getData(), - "Network B", - outA.paraGrads[i]->getSize()); - } - } -} - -void compareNetwork(const std::string& config_file_a, - const std::string& config_file_b) { - DataIn in; - initArgument(in, config_file_a); - - DataOut dataA; - calcGradient(in, dataA, config_file_a); - LOG(INFO) << "forwardBackward of Network A is finished"; - globalStat.printSegTimerStatus(); - globalStat.reset(); - LOG(INFO) << "\n\n"; - - DataOut dataB; - calcGradient(in, dataB, config_file_b); - LOG(INFO) << "forwardBackward of the Network B is finished"; - globalStat.printSegTimerStatus(); - globalStat.reset(); - LOG(INFO) << "\n\n"; - - compareGradient(dataA, dataB); -} - -TEST(Compare, concat_dotmul) { - std::string config_file_a = "./gserver/tests/concat_dotmul_a.conf"; - std::string config_file_b = 
"./gserver/tests/concat_dotmul_b.conf"; - compareNetwork(config_file_a, config_file_b); -} - -TEST(Compare, concat_fullmatrix) { - std::string config_file_a = "./gserver/tests/concat_fullmatrix_a.conf"; - std::string config_file_b = "./gserver/tests/concat_fullmatrix_b.conf"; - compareNetwork(config_file_a, config_file_b); -} - -TEST(Compare, concat_table) { - std::string config_file_a = "./gserver/tests/concat_table_a.conf"; - std::string config_file_b = "./gserver/tests/concat_table_b.conf"; - compareNetwork(config_file_a, config_file_b); -} - -TEST(Compare, concat_slice) { - std::string config_file_a = "./gserver/tests/concat_slice_a.conf"; - std::string config_file_b = "./gserver/tests/concat_slice_b.conf"; - compareNetwork(config_file_a, config_file_b); -} - -#ifdef PADDLE_WITH_CUDA -TEST(Compare, img_pool) { - std::string config_file_a = "./gserver/tests/img_pool_a.conf"; - std::string config_file_b = "./gserver/tests/img_pool_b.conf"; - bool useGpu = FLAGS_use_gpu; - FLAGS_use_gpu = true; - compareNetwork(config_file_a, config_file_b); - FLAGS_use_gpu = useGpu; -} - -TEST(Compare, img_conv) { - std::string config_file_a = "./gserver/tests/img_conv_a.conf"; - std::string config_file_b = "./gserver/tests/img_conv_b.conf"; - bool useGpu = FLAGS_use_gpu; - FLAGS_use_gpu = true; - compareNetwork(config_file_a, config_file_b); - FLAGS_use_gpu = useGpu; -} - -// Test cudnn_conv and exconv give the same result -TEST(Compare, img_conv2) { - std::string config_file_a = "./gserver/tests/img_conv_cudnn.py"; - std::string config_file_b = "./gserver/tests/img_conv_exconv.py"; - bool useGpu = FLAGS_use_gpu; - double eps = FLAGS_checkgrad_eps; - FLAGS_use_gpu = true; - // Sometimes, this unit test will fail with 1e-2 - FLAGS_checkgrad_eps = 4e-2; - compareNetwork(config_file_a, config_file_b); - FLAGS_use_gpu = useGpu; - FLAGS_checkgrad_eps = eps; -} -#endif - -DEFINE_string(config_file_a, "", "config of one network to compare"); -DEFINE_string(config_file_b, "", "config of another network to compare"); -TEST(Compare, network) { - if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") { - compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b); - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - paddle::initMain(argc, argv); - initPython(argc, argv); - int ret = RUN_ALL_TESTS(); - return ret; -} diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp deleted file mode 100644 index a1dee9795077b835392469b5085e9728679a1664..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_PyDataProvider.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include - -#include "paddle/gserver/dataproviders/PyDataProvider.h" -#include "paddle/utils/Util.h" - -#include "paddle/testing/TestUtil.h" - -using namespace std; // NOLINT -using namespace paddle; // NOLINT - -void simpleValueCheck(const vector& argumentList, bool useGpu); -void simpleSequenceCheck(const vector& argumentList, int sample_num); - -TEST(PyDataProvider, py_fill_slots) { - DataConfig config; - config.set_type("py"); - config.set_async_load_data(false); - config.set_load_data_module(std::string("pyDataProvider")); - config.set_load_data_object(std::string("SimpleDataProvider")); - config.clear_files(); - std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList"; - config.set_files(dataFile); -#ifndef PADDLE_WITH_CUDA - bool useGpu = false; -#else - bool useGpu = true; -#endif - unique_ptr dataProvider(DataProvider::create(config, useGpu)); - DataBatch dataBatch; - dataProvider->getNextBatchInternal(2, &dataBatch); - const std::vector& argumentList = dataBatch.getStreams(); - // Check size - EXPECT_EQ(argumentList.size(), 3UL); - EXPECT_EQ(argumentList[0].value->getWidth(), 3UL); - EXPECT_EQ(argumentList[0].value->getHeight(), 2UL); - EXPECT_EQ(argumentList[0].value->getElementCnt(), 6UL); - EXPECT_EQ(argumentList[1].value->getWidth(), 7UL); - EXPECT_EQ(argumentList[1].value->getHeight(), 2UL); - EXPECT_EQ(argumentList[1].value->getElementCnt(), 4UL); - EXPECT_EQ(argumentList[2].ids->getSize(), 2UL); - // Check value - simpleValueCheck(argumentList, useGpu); - // Check sequenceStartPositions - simpleSequenceCheck(argumentList, 2); -} - -TEST(PyDataProvider, py_fill_nest_slots) { - DataConfig config; - config.set_type("py"); - config.set_async_load_data(false); - config.set_load_data_module(std::string("pyDataProvider")); - config.set_load_data_object(std::string("SimpleNestDataProvider")); - config.clear_files(); - std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList"; - config.set_files(dataFile); - EXPECT_EQ(config.IsInitialized(), true); -#ifndef PADDLE_WITH_CUDA - bool useGpu = false; -#else - bool useGpu = true; -#endif - unique_ptr dataProvider(DataProvider::create(config, useGpu)); - DataBatch dataBatch; - dataProvider->getNextBatchInternal(2, &dataBatch); - const std::vector& argumentList = dataBatch.getStreams(); - // Check size - EXPECT_EQ(argumentList.size(), 3UL); - EXPECT_EQ(argumentList[0].value->getWidth(), 3UL); - EXPECT_EQ(argumentList[0].value->getHeight(), 4UL); - EXPECT_EQ(argumentList[0].value->getElementCnt(), 12UL); - EXPECT_EQ(argumentList[1].value->getWidth(), 7UL); - EXPECT_EQ(argumentList[1].value->getHeight(), 4UL); - EXPECT_EQ(argumentList[1].value->getElementCnt(), 8UL); - EXPECT_EQ(argumentList[2].ids->getSize(), 4UL); - // Check value - simpleValueCheck(argumentList, useGpu); - // Check sequenceStartPositions - simpleSequenceCheck(argumentList, 4); - // Check subSequenceStartPositions - EXPECT_EQ(argumentList[0].subSequenceStartPositions->getSize(), 4UL); - EXPECT_EQ(argumentList[1].subSequenceStartPositions->getSize(), 3UL); - EXPECT_EQ(argumentList[2].subSequenceStartPositions->getSize(), 4UL); - for (size_t i = 0; i < argumentList.size(); i++) { - EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(0), 0); - EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(1), 1); - if (i != 1) { - EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 2); - EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(3), 4); - } else { - 
EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 4); - } - } -} - -void simpleValueCheck(const vector& argumentList, bool useGpu) { - // Dense - real* data; - if (useGpu) { - MatrixPtr cpuMatrixPtr = Matrix::create(argumentList[0].value->getHeight(), - argumentList[0].value->getWidth(), - 0, - 0); - cpuMatrixPtr->copyFrom(*argumentList[0].value); - data = cpuMatrixPtr->getData(); - } else { - data = argumentList[0].value->getData(); - } - for (size_t i = 0; i < argumentList[0].value->getElementCnt(); ++i) { - EXPECT_EQ(*(data + i), (float)(i % 3 + 1)); - } - // Sparse without value - GpuSparseMatrixPtr matGpu; - CpuSparseMatrixPtr matCpu; - if (useGpu) { - matGpu = dynamic_pointer_cast(argumentList[1].value); - ASSERT_TRUE(matGpu != NULL); - } else { - data = argumentList[0].value->getData(); - matCpu = dynamic_pointer_cast(argumentList[1].value); - ASSERT_TRUE(matCpu != NULL); - } - for (size_t i = 0; i < argumentList[1].value->getHeight(); ++i) { - size_t colNum = useGpu ? matGpu->getColNum(i) : matCpu->getColNum(i); - EXPECT_EQ(colNum, (size_t)2); - const int* buf = useGpu ? matGpu->getRowCols(i) : matCpu->getRowCols(i); - for (size_t j = 0; j < colNum; ++j) { - EXPECT_EQ((size_t)buf[j], (size_t)(j + 1)); - } - } - // Index - for (size_t j = 0; j < argumentList[2].ids->getSize(); ++j) { - EXPECT_EQ((size_t)argumentList[2].ids->get(j), 0UL); - } -} - -void simpleSequenceCheck(const vector& argumentList, int sample_num) { - EXPECT_EQ(argumentList[0].sequenceStartPositions->getSize(), 3UL); - EXPECT_EQ(argumentList[1].sequenceStartPositions->getSize(), 2UL); - EXPECT_EQ(argumentList[2].sequenceStartPositions->getSize(), 3UL); - for (size_t i = 0; i < argumentList.size(); i++) { - EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(0), 0); - if (i != 1) { - EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), 1); - EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(2), - sample_num); - } else { - EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), - sample_num); - } - } -} - -int main(int argc, char** argv) { - initMain(argc, argv); - initPython(argc, argv); - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp deleted file mode 100644 index b39fb3534509ebde2702c02e35800fe3ed6016c3..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ /dev/null @@ -1,409 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef PADDLE_NO_PYTHON -#include -#include -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/utils/PythonUtil.h" -#include "paddle/utils/Util.h" - -DEFINE_string(train_list, "unittest.list", "file list for unittest"); - -namespace paddle { -namespace unittest { -namespace pydp2 { -extern void setOnPoolFilledHook(const std::function &func); -extern void clearOnPoolFilledHook(); - -} // namespace pydp2 -} // namespace unittest -} // namespace paddle - -const paddle::real epsilon = 1e-5; - -static inline int64_t readDataBatch(paddle::DataBatch *batch, - const std::string &funcName, - int64_t batchSize = 65535) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object(funcName); - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->setSkipShuffle(); - provider->reset(); - return provider->getNextBatchInternal(batchSize, batch); -} - -TEST(PyDataProvider2, dense_no_seq) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_dense_no_seq"); - - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - - provider->setSkipShuffle(); // skip shuffle for unittest. - - paddle::DataBatch batch; - for (size_t pass = 0; pass < 2; ++pass) { // read 2 passes - provider->reset(); - int64_t num = provider->getNextBatchInternal(100, &batch); - ASSERT_NE(num, 0); - ASSERT_EQ((size_t)batch.getStreams().size(), (size_t)1); - ASSERT_EQ((size_t)batch.getSize(), (size_t)100); - // Check batch data. - for (size_t i = 0; i < 100; ++i) { - for (size_t j = 0; j < 200; ++j) { - paddle::real tmp = (paddle::real)((j - 100.0) * (i + 1) / 200.0); - ASSERT_NEAR( - batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon); - } - } - - num = provider->getNextBatchInternal(100, &batch); - ASSERT_NE(num, 0); - ASSERT_EQ(batch.getStreams().size(), (size_t)1); - ASSERT_EQ((size_t)batch.getSize(), (size_t)100); - // Check batch data. - for (size_t i = 0; i < 100; ++i) { - size_t ii = i + 100; - for (size_t j = 0; j < 200; ++j) { - paddle::real tmp = (paddle::real)((j - 100.0) * (ii + 1) / 200.0); - ASSERT_NEAR( - batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon); - } - } - num = provider->getNextBatchInternal(100, &batch); - ASSERT_EQ(num, 0); - } -} - -TEST(PyDataProvider2, index_no_seq) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_index_no_seq"); - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - - provider->setSkipShuffle(); // skip shuffle for unittest. 
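// The pattern repeated throughout these tests: reset() rewinds the provider
// to the start of a pass, getNextBatchInternal(n, &batch) returns how many
// samples were actually delivered (which may be fewer than n), and a return
// value of 0 signals that the pass is exhausted.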
- paddle::DataBatch batch; - for (size_t pass = 0; pass < 2; ++pass) { - provider->reset(); - int64_t num = provider->getNextBatchInternal(10000, &batch); - CHECK_EQ(num, 200); - for (int i = 0; i < 200; ++i) { - CHECK_EQ(i, batch.getStreams()[0].ids->getData()[i]); - } - } -} - -TEST(PyDataProvider2, init_hook) { - paddle::PyObjectPtr pickle = paddle::py::import("pickle"); - paddle::PyObjectPtr globals(PyModule_GetDict(PyImport_AddModule("__main__"))); - PyDict_SetItemString(globals.get(), "pickle", pickle.get()); - paddle::PyObjectPtr locals(PyDict_New()); - paddle::PyObjectPtr mdl(PyRun_String( - "dumps = pickle.dumps({'value':[float(x) for x in xrange(20)]})", - Py_file_input, - globals.get(), - locals.get())); - CHECK_PY(mdl) << "Error!"; - paddle::PyObjectPtr dps(PyDict_GetItemString(locals.get(), "dumps")); - CHECK_PY(dps) << "Error!"; - - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_init_hook"); - config.set_load_data_args(PyString_AsString(dps.get())); - - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->setSkipShuffle(); // skip shuffle for unittest. - provider->reset(); - paddle::DataBatch batch; - int64_t num = provider->getNextBatchInternal(100000, &batch); - ASSERT_EQ(num, 200); - auto &mat = batch.getStreams()[0].value; - ASSERT_EQ((size_t)mat->getWidth(), (size_t)20); - for (size_t i = 0; i < 200; ++i) { - for (size_t j = 0; j < 20; ++j) { - ASSERT_NEAR((paddle::real)j, mat->getData()[i * 20 + j], epsilon); - } - } -} - -TEST(PyDataProvider2, sparse_no_value_no_seq) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_sparse_non_value_no_seq"); - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->setSkipShuffle(); - provider->reset(); - paddle::DataBatch batch; - int64_t num = provider->getNextBatchInternal(10000, &batch); - CHECK_EQ(num, 200); - auto csm = std::dynamic_pointer_cast( - batch.getStreams()[0].value); - CHECK(csm != nullptr); - for (int i = 0; i < 200; ++i) { - CHECK_EQ(csm->getColNum(i), (size_t)10); - int *cols = csm->getRowCols(i); - for (int j = 0; j < 10; ++j) { - CHECK_EQ(cols[j], (i + 1) * (j + 1)); - } - } -} - -TEST(PyDataProvider2, sparse_value_no_seq) { - paddle::DataBatch batch; - CHECK_EQ(readDataBatch(&batch, "test_sparse_value_no_seq"), 200); - auto csm = std::dynamic_pointer_cast( - batch.getStreams()[0].value); - CHECK(csm != nullptr); - for (int i = 0; i < 200; ++i) { - CHECK_EQ(csm->getColNum(i), (size_t)10); - int *cols = csm->getRowCols(i); - real *dat = csm->getRowValues(i); - for (int j = 0; j < 10; ++j) { - EXPECT_EQ(cols[j], (i + 1) * (j + 1)); - EXPECT_EQ(dat[j], real(j) / real(i + 1)); - } - } -} - -TEST(PyDataProvider2, index_seq) { - paddle::DataBatch batch; - CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200); - auto &arg = batch.getStreams()[0]; - CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2); - size_t tmp = 0; - for (size_t i = 0; i < 200; ++i) { // CHECK DATA CORRECT - for (size_t j = 0; j < i + 1; ++j) { - ASSERT_EQ((size_t)arg.ids->getData()[tmp], j); - ++tmp; - } - } - ASSERT_EQ(arg.sequenceStartPositions->getSize(), (size_t)201); - tmp = 0; - for (size_t i = 0; i < 200; ++i) { - tmp += i; - ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i], tmp); 
- } - tmp += 200; - ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[200], tmp); -} - -TEST(PyDataProvider2, index_sub_seq) { - paddle::DataBatch batch; - ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200); - auto &arg = batch.getStreams()[0]; - size_t tmp = 0; - for (size_t i = 0; i < 200; ++i) { - for (size_t j = 0; j < i + 1; ++j) { - for (size_t k = 0; k < j + 1; ++k) { - CHECK_EQ((size_t)arg.ids->getData()[tmp++], k); - } - } - } - - CHECK_EQ(tmp, arg.ids->getSize()); - - ASSERT_EQ((size_t)arg.sequenceStartPositions->getSize(), (size_t)201); - ASSERT_EQ(arg.subSequenceStartPositions->getData(false)[0], 0); - ASSERT_EQ(arg.sequenceStartPositions->getData(false)[0], 0); - size_t idx = 1; - tmp = 0; - for (size_t i = 0; i < 200; ++i) { - for (size_t j = 0; j < i + 1; ++j) { - tmp += j + 1; - ASSERT_EQ((size_t)arg.subSequenceStartPositions->getData(false)[idx], - (size_t)tmp); - ++idx; - } - ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i + 1], tmp); - } -} - -TEST(PyDataProvider2, min_pool_size) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_min_pool_size"); - config.set_load_data_args(""); - size_t totalData = 1 << 14; - constexpr size_t batchSize = 100; - constexpr size_t minPoolSize = 1000; - paddle::DataBatch batch; - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->reset(); - - paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) { - if (totalData > batchSize) { - CHECK_GE(poolSize, std::min(totalData - batchSize, minPoolSize)); - } - }); - while (true) { - int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); - if (realBatchSize) { - totalData -= realBatchSize; - } else { - break; - } - } - paddle::unittest::pydp2::clearOnPoolFilledHook(); -} - -TEST(PyDataProvider2, can_over_batch_size) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_can_over_batch_size"); - config.set_load_data_args(""); - paddle::DataBatch batch; - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->reset(); - constexpr size_t batchSize = 100; - while (true) { - int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); - if (realBatchSize) { - CHECK_LE(static_cast(realBatchSize), batchSize); - } else { - break; - } - } -} - -TEST(PyDataProvider2, input_order) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_input_order"); - config.set_load_data_args(""); - - paddle::ModelConfig modelConfig; - *modelConfig.add_input_layer_names() = "input1"; - *modelConfig.add_input_layer_names() = "input2"; - paddle::DataBatch batch; - std::unique_ptr provider( - paddle::DataProvider::create(config, modelConfig, false)); - provider->reset(); - constexpr size_t batchSize = 100; - while (true) { - int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); - if (!realBatchSize) { - break; - } - ASSERT_EQ(batch.getStreams().size(), static_cast(2)); - for (int64_t i = 0; i < realBatchSize; ++i) { - ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0); - ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1); - } - } -} - 
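The index_seq and index_sub_seq checks above lean on the Argument layout convention that sequenceStartPositions holds N + 1 cumulative offsets for N sequences (element 0 is always 0 and element N is the total sample count), with subSequenceStartPositions nesting the same scheme one level down. A small standalone sketch of that bookkeeping (not Paddle code):

#include <cassert>
#include <vector>

// Build the N + 1 cumulative start offsets for N sequence lengths.
static std::vector<int> startPositions(const std::vector<int>& lengths) {
  std::vector<int> starts(1, 0);
  for (int len : lengths) starts.push_back(starts.back() + len);
  return starts;
}

int main() {
  // Sequences of lengths 1, 2 and 3 pack into offsets {0, 1, 3, 6}.
  std::vector<int> starts = startPositions({1, 2, 3});
  assert(starts.size() == 4);
  assert(starts[1] == 1 && starts[2] == 3 && starts.back() == 6);
  return 0;
}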
-TEST(PyDataProvider2, test_check) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_check"); - config.set_load_data_args(""); - paddle::DataBatch batch; - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->reset(); - while (true) { - int64_t realBatchSize = provider->getNextBatchInternal(100, &batch); - if (!realBatchSize) { - break; - } else { - auto &ivec = batch.getStream(0).ids; - for (size_t i = 0; i < ivec->getSize(); ++i) { - CHECK_LT(ivec->getData()[i], 10); - } - } - } -} - -TEST(PyDataProvider2, multiThread) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_dense_no_seq"); - config.set_async_load_data(true); - - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->reset(); - paddle::DataBatch batch; - provider->getNextBatch(100, &batch); - provider->reset(); - provider.reset(); -} - -TEST(PyDataProvider2, minPoolSizeWithCache) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_min_pool_size_with_cache"); - config.set_async_load_data(true); - - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - - paddle::DataBatch batch; - - for (int i = 0; i < 10; ++i) { - provider->reset(); - int64_t sum = 0; - while (int64_t actualNum = provider->getNextBatch(100, &batch)) { - sum += actualNum; - } - ASSERT_EQ(1 << 20, sum); - } -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - paddle::initMain(argc, argv); - paddle::initPython(argc, argv); - - std::ofstream fout(FLAGS_train_list); - CHECK(fout.is_open()); - fout << "stub file name" << std::endl; // in unittest, filename is not used. - fout.close(); - - return RUN_ALL_TESTS(); -} - -#endif diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp deleted file mode 100644 index 9770567b88a2af946b30439300540ed61694ba10..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include - -DECLARE_int32(seed); - -using namespace paddle; // NOLINT -using namespace std; // NOLINT -class TrainerForTest : public paddle::Trainer { - public: - void startTrain() { - GradientMachine& gm = *this->trainerInternal_.getGradientMachine(); - gm.start(); - } - - void finishTrain() { - GradientMachine& gm = *this->trainerInternal_.getGradientMachine(); - gm.finish(); - } - - /** - * Get total dimension of all parameters. - * - * @return the total dimension of all parameters - */ - size_t getTotalParameterSize() const { - auto p = const_cast(this); - auto& params = p->getGradientMachine()->getParameters(); - return std::accumulate( - params.begin(), params.end(), 0UL, [](size_t a, const ParameterPtr& p) { - return a + p->getSize(); - }); - } -}; - -void CalCost(const string& conf, - const string& dir, - real* cost, - int num_passes) { - auto config = std::make_shared(conf); - TrainerForTest trainer; - trainer.init(config); - mkDir(dir.c_str()); - config->setSaveDir(dir); - auto dataProvider = trainer.getDataProvider(); - int32_t batchSize = config->getOptConfig().batch_size(); - real learningRate = config->getOptConfig().learning_rate(); - real momentum = 0; - real decayRate = 0; - int64_t dim = trainer.getTotalParameterSize(); - CpuVector vecW(dim); - CpuVector vecGradient(dim); - CpuVector vecMomentum(dim); - - // vecW needs to be assigned, otherwise the variable is an uncertain value. - - *ThreadLocalRand::getSeed() = FLAGS_seed; - vecW.randnorm(0, 0.1); - vecMomentum.randnorm(0, 0.1); - - trainer.startTrain(); - for (int i = 0; i < num_passes; ++i) { - real totalCost = 0; - dataProvider->reset(); - while (true) { - DataBatch dataBatch; - int num = dataProvider->getNextBatch(batchSize, &dataBatch); - if (num == 0) break; - totalCost += trainer.calcGradient(dataBatch, vecW, vecGradient); - sgdUpdate( - learningRate, momentum, decayRate, &vecW, &vecGradient, &vecMomentum); - } - cost[i] = totalCost; - } - trainer.finishTrain(); - rmDir(dir.c_str()); -} - -void test(const string& conf1, const string& conf2, double eps, bool useGpu) { - if (!paddle::version::isWithGpu() && useGpu) { - return; - } - FLAGS_use_gpu = useGpu; - int num_passes = 5; - real* cost1 = new real[num_passes]; - const string dir1 = "gserver/tests/t1"; - CalCost(conf1, dir1, cost1, num_passes); - - real* cost2 = new real[num_passes]; - const string dir2 = "gserver/tests/t2"; - CalCost(conf2, dir2, cost2, num_passes); - - for (int i = 0; i < num_passes; i++) { - LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i] - << ", cost2=" << cost2[i] - << ", diff=" << std::abs(cost1[i] - cost2[i]); - ASSERT_NEAR(cost1[i], cost2[i], eps); - } - delete[] cost1; - delete[] cost2; -} - -TEST(RecurrentGradientMachine, HasSubSequence) { - for (bool useGpu : {false, true}) { - test("gserver/tests/sequence_layer_group.conf", - "gserver/tests/sequence_nest_layer_group.conf", - 1e-5, - useGpu); - } -} - -TEST(RecurrentGradientMachine, rnn) { - for (bool useGpu : {false, true}) { - test("gserver/tests/sequence_rnn.conf", - "gserver/tests/sequence_nest_rnn.conf", - 1e-6, - useGpu); - } -} - -TEST(RecurrentGradientMachine, rnn_multi_input) { - for (bool useGpu : {false, true}) { - test("gserver/tests/sequence_rnn_multi_input.conf", - "gserver/tests/sequence_nest_rnn_multi_input.conf", - 1e-6, - useGpu); - } -} - -TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) { - for (bool useGpu : {false, true}) { - 
test("gserver/tests/sequence_rnn_multi_unequalength_inputs.py", - "gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py", - 1e-6, - useGpu); - } -} - -TEST(RecurrentGradientMachine, rnn_mixed_input) { - for (bool useGpu : {false, true}) { - test("gserver/tests/sequence_rnn_mixed_inputs.py", - "gserver/tests/sequence_rnn_matched_inputs.py", - 1e-6, - useGpu); - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - - if (paddle::version::isWithPyDataProvider()) { - if (!paddle::version::isWithGpu()) { - FLAGS_use_gpu = false; - } - initMain(argc, argv); - initPython(argc, argv); - return RUN_ALL_TESTS(); - } else { - return 0; - } -} diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp deleted file mode 100644 index b54e37b7dbf8bffeb949f709e6a4f9ec86ea13c3..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ /dev/null @@ -1,571 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/Layer.h" - -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT -DECLARE_bool(use_gpu); -DECLARE_bool(rnn_use_batch); -DECLARE_int32(fixed_seq_length); - -void checkError(const Matrix& matrix1, const Matrix& matrix2) { - CHECK(matrix1.getHeight() == matrix2.getHeight()); - CHECK(matrix1.getWidth() == matrix2.getWidth()); -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - - int height = matrix1.getHeight(); - int width = matrix1.getWidth(); - const real* data1 = matrix1.getData(); - const real* data2 = matrix2.getData(); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { - count++; - } - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -void checkError(const CpuVector& vector1, const CpuVector& vector2) { - CHECK(vector1.getSize() == vector2.getSize()); -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - - int size = vector1.getSize(); - const real* data1 = vector1.getData(); - const real* data2 = vector2.getData(); - int count = 0; - for (int i = 0; i < size; i++) { - if (fabs(data1[i] - data2[i]) > err) { - count++; - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -LayerPtr creatDataLayer(string name, - size_t batchSize, - int layerSize, - bool useGpu) { - LayerConfig dataConfig; - dataConfig.set_name(name); - dataConfig.set_type("data"); - dataConfig.set_size(layerSize); - LayerPtr layer = LayerPtr(new DataLayer(dataConfig)); - - Argument data; - data.value = Matrix::create(batchSize, layer->getSize(), false, useGpu); - data.grad = Matrix::create(batchSize, layer->getSize(), false, useGpu); - 
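// Argument is the container layers exchange: `value` holds the forward
// activations and `grad` the back-propagated gradients, each sized
// batchSize x layerSize. The statements that follow squash the uniform
// random input through a sigmoid, zero the gradient buffer, and attach
// generated sequence start positions so the recurrent layer under test
// sees genuine sequence boundaries.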
data.value->randomizeUniform(); - data.value->add(-0.5); - data.value->sigmoid(*data.value); - data.grad->zeroMem(); - - generateSequenceStartPositions(batchSize, data.sequenceStartPositions); - - DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); - dataLayer->setData(data); - dataLayer->forward(PASS_GC); - - return layer; -} - -ParameterPtr creatParameter(string name, - int pid, - size_t paraSize, - bool useGpu) { - ParameterConfig paraConfig; - paraConfig.set_name(name); - paraConfig.set_size(paraSize); - - ParameterPtr parameter = - std::make_shared(paraConfig, useGpu, /*initialize */ false); - parameter->enableType(PARAMETER_VALUE); - parameter->enableType(PARAMETER_GRADIENT); - parameter->randomize(); - parameter->setID(pid); - - return parameter; -} - -ParameterPtr creatParameterBias(string name, - int pid, - size_t paraSize, - bool useGpu) { - ParameterConfig paraConfig; - paraConfig.set_name(name); - paraConfig.set_size(paraSize); - paraConfig.set_initial_std(1); - - ParameterPtr parameter = - std::make_shared(paraConfig, useGpu, /*initialize */ true); - parameter->randomize(); - parameter->setID(pid); - - return parameter; -} - -LayerPtr initRecurrentLayer(LayerConfig layerConfig, - size_t batchSize, - int layerSize, - bool useGpu) { - FLAGS_use_gpu = useGpu; - LayerMap layerMap; - ParameterMap parameterMap; - LayerPtr dataLayer = creatDataLayer("layer_0", batchSize, layerSize, useGpu); - layerMap[dataLayer->getName()] = dataLayer; - - ParameterPtr para = - creatParameter("para_0", 0, layerSize * layerSize, useGpu); - parameterMap[para->getName()] = para; - - layerConfig.add_inputs(); - LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); - input.set_input_layer_name("layer_0"); - input.set_input_parameter_name("para_0"); - LayerPtr testLayer = Layer::create(layerConfig); - layerMap[testLayer->getName()] = testLayer; - - testLayer->init(layerMap, parameterMap); - testLayer->setNeedGradient(true); - - return testLayer; -} - -void checkRecurrentLayer(LayerPtr testLayer) { - const VectorPtr& weightGrad = - (testLayer->getParameters()[0])->getBuf(PARAMETER_GRADIENT); - const MatrixPtr& inputGrad = testLayer->getPrev(0)->getOutputGrad(); - CpuVector seqPara(weightGrad->getSize()); - CpuVector batPara(weightGrad->getSize()); - CpuMatrix seqInputGrad(inputGrad->getHeight(), inputGrad->getWidth()); - CpuMatrix batInputGrad(inputGrad->getHeight(), inputGrad->getWidth()); - - CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth()); - outputGrad.randomizeUniform(); - - /* use sequence calculate */ - FLAGS_rnn_use_batch = false; - weightGrad->zero(); - inputGrad->zero(); - testLayer->forward(PASS_GC); - testLayer->getOutputGrad()->copyFrom(outputGrad); - testLayer->backward(); - seqPara.copyFrom(*weightGrad); - seqInputGrad.copyFrom(*inputGrad); - - /* use batch calculate */ - FLAGS_rnn_use_batch = true; - weightGrad->zero(); - inputGrad->zero(); - testLayer->forward(PASS_GC); - testLayer->getOutputGrad()->copyFrom(outputGrad); - testLayer->backward(); - batPara.copyFrom(*weightGrad); - batInputGrad.copyFrom(*inputGrad); - - /* check */ - checkError(seqInputGrad, batInputGrad); - checkError(seqPara, batPara); -} - -TEST(Layer, RecurrentLayer) { - LayerConfig layerConfig; - layerConfig.set_name("rnn"); - layerConfig.set_type("recurrent"); - layerConfig.set_active_type("tanh"); - for (auto layerSize : {1, 10, 64, 128, 256, 512}) { - for (auto batchSize : {1, 5, 20, 100, 128}) { - for (auto useGpu : {false, true}) { - for (auto reversed : {false, true}) { - LOG(INFO) 
<< " layerSize=" << layerSize << " batchSize=" << batchSize - << " useGpu=" << useGpu << " reversed=" << reversed; - layerConfig.set_size(layerSize); - layerConfig.set_reversed(reversed); - LayerPtr testLayer = - initRecurrentLayer(layerConfig, batchSize, layerSize, useGpu); - checkRecurrentLayer(testLayer); - } - } - } - } -} - -#define protected public -#include "paddle/gserver/layers/GatedRecurrentLayer.h" -#include "paddle/gserver/layers/LstmLayer.h" -#include "paddle/gserver/layers/RecurrentLayer.h" -template -class TestRecurrentLayer { - public: - LayerConfig config_; - bool useGpu_; - bool useBatch_; - LayerPtr testLayer_; - LayerPtr dataLayer_; - ParameterPtr para_; - ParameterPtr bias_; - LayerMap layerMap_; - ParameterMap parameterMap_; - TestRecurrentLayer(const LayerConfig& config, - bool useGpu, - bool useBatch = false) - : config_(config), useGpu_(useGpu), useBatch_(useBatch) {} - void init(size_t batchSize) { - FLAGS_use_gpu = useGpu_; - testLayer_ = Layer::create(config_); - if (typeid(T) == typeid(GatedRecurrentLayer)) { - dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(), - batchSize, - config_.size() * 3, - useGpu_); - para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(), - 0, - config_.size() * config_.size() * 3, - useGpu_); - bias_ = creatParameterBias( - config_.bias_parameter_name(), 1, config_.size() * 3, useGpu_); - } else if (typeid(T) == typeid(LstmLayer)) { - dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(), - batchSize, - config_.size() * 4, - useGpu_); - para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(), - 0, - config_.size() * config_.size() * 4, - useGpu_); - bias_ = creatParameterBias( - config_.bias_parameter_name(), 1, config_.size() * 7, useGpu_); - } - layerMap_[dataLayer_->getName()] = dataLayer_; - parameterMap_[para_->getName()] = para_; - parameterMap_[bias_->getName()] = bias_; - - layerMap_[testLayer_->getName()] = testLayer_; - testLayer_->init(layerMap_, parameterMap_); - testLayer_->setNeedGradient(true); - (dynamic_cast(testLayer_.get()))->useBatch_ = useBatch_; - } - void forward() { - FLAGS_use_gpu = useGpu_; - testLayer_->forward(PASS_GC); - } - void backward() { - FLAGS_use_gpu = useGpu_; - testLayer_->backward(nullptr); - } -}; - -template -void checkRecurrentLayer(LayerConfig layerConfig, - size_t batchSize, - bool cpuBatch, - bool gpuBatch) { - TestRecurrentLayer testCpu(layerConfig, false, cpuBatch); - TestRecurrentLayer testGpu(layerConfig, true, gpuBatch); - testCpu.init(batchSize); - testGpu.init(batchSize); - auto checkError = []( - MatrixPtr cpu, MatrixPtr gpu, int numSequences, const char* str) { - CpuMatrix check(gpu->getHeight(), gpu->getWidth()); - check.copyFrom(*gpu); - int height = cpu->getHeight(); - int width = cpu->getWidth(); - const real* data1 = cpu->getData(); - const real* data2 = check.getData(); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - if (fabs(data1[i * width + j] - data2[i * width + j]) / numSequences > - 1e-4) { - count++; - } - } - } - EXPECT_EQ(count, 0) << "[" << str << "]" - << "There are " << count << " different element."; - }; - T* cpuLayer = dynamic_cast(testCpu.testLayer_.get()); - T* gpuLayer = dynamic_cast(testGpu.testLayer_.get()); - - Argument& cpuInput = testCpu.dataLayer_->getOutput(); - Argument& gpuInput = testGpu.dataLayer_->getOutput(); - gpuInput.resizeAndCopyFrom(cpuInput, true); - - const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE); 
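// CPU/GPU parity pattern: the two layers were constructed with
// independently randomized parameters, so before any comparison the CPU
// parameter and bias buffers are copied into their GPU counterparts
// (below). Only then are the forward outputs and the input/weight/bias
// gradients expected to match, up to the per-sequence tolerance used by
// the checkError lambda defined above.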
- const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE); - gpuVec->copyFrom(*cpuVec); - - const VectorPtr& cpuBiasVec = testCpu.bias_->getBuf(PARAMETER_VALUE); - const VectorPtr& gpuBiasVec = testGpu.bias_->getBuf(PARAMETER_VALUE); - gpuBiasVec->copyFrom(*cpuBiasVec); - - /* check forward */ - testCpu.forward(); - testGpu.forward(); - - checkError( - cpuLayer->getOutputValue(), gpuLayer->getOutputValue(), 1, "outputValue"); - - /* check backward */ - cpuLayer->getOutputGrad()->randomizeUniform(); - gpuLayer->getOutputGrad()->copyFrom(*cpuLayer->getOutputGrad()); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - - testCpu.backward(); - testGpu.backward(); - - // check input grad - checkError(cpuInput.grad, gpuInput.grad, 1, "inputGrad"); - // check weight grad - int numSequences = cpuInput.getNumSequences(); - checkError(cpuLayer->weight_->getWGrad(), - gpuLayer->weight_->getWGrad(), - numSequences, - "weightGrad"); - // check bias grad - checkError(cpuLayer->bias_->getWGrad(), - gpuLayer->bias_->getWGrad(), - numSequences, - "biasGrad"); -} - -TEST(Layer, GatedRecurrentLayer) { - LayerConfig layerConfig; - layerConfig.set_type("gated_recurrent"); - layerConfig.set_active_type("sigmoid"); - layerConfig.set_active_gate_type("sigmoid"); - - layerConfig.add_inputs(); - LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); - input.set_input_layer_name("layer_0"); - input.set_input_parameter_name("para_0"); - layerConfig.set_bias_parameter_name("bias"); - - for (auto frameSize : {32, 64, 128, 256, 512}) { - for (auto batchSize : {1, 5, 100, 500}) { - for (auto reversed : {false, true}) { - for (auto cpuBatch : {false, true}) { - for (auto gpuBatch : {false, true}) { - LOG(INFO) << " batchSize=" << batchSize - << " frameSize=" << frameSize << " reversed=" << reversed - << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch; - layerConfig.set_size(frameSize); - layerConfig.set_reversed(reversed); - checkRecurrentLayer( - layerConfig, batchSize, cpuBatch, gpuBatch); - } - } - } - } - } -} - -TEST(Layer, LstmLayer) { - LayerConfig layerConfig; - layerConfig.set_type("lstmemory"); - layerConfig.set_active_type("relu"); - layerConfig.set_active_state_type("tanh"); - layerConfig.set_active_gate_type("sigmoid"); - - layerConfig.add_inputs(); - LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); - input.set_input_layer_name("layer_0"); - input.set_input_parameter_name("para_0"); - layerConfig.set_bias_parameter_name("bias"); - - for (auto frameSize : {32, 64, 128, 256, 512}) { - for (auto batchSize : {1, 5, 100, 500}) { - for (auto reversed : {false, true}) { - for (auto cpuBatch : {false, true}) { - for (auto gpuBatch : {false, true}) { - LOG(INFO) << " batchSize=" << batchSize - << " frameSize=" << frameSize << " reversed=" << reversed - << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch; - layerConfig.set_size(frameSize); - layerConfig.set_reversed(reversed); - checkRecurrentLayer( - layerConfig, batchSize, cpuBatch, gpuBatch); - } - } - } - } - } -} - -#ifdef PADDLE_WITH_MKLML - -#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h" - -LayerPtr initMKLPackedLayer(LayerConfig layerConfig, - bool reversed, - int layerSize, - LayerPtr dataLayer, - ParameterPtr para, - ParameterPtr bias = nullptr) { - LayerMap layerMap; - ParameterMap parameterMap; - layerMap[dataLayer->getName()] = dataLayer; - parameterMap[para->getName()] = para; - if (bias) { - parameterMap[bias->getName()] = bias; - layerConfig.set_bias_parameter_name("bias_0"); - } - - 
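// The remaining wiring (below) mirrors the plain recurrent-layer tests:
// the config receives its size and direction, and its single input is
// bound to the shared data layer "layer_0" and parameter "para_0", so the
// reference layer and the MKL-packed layer read identical inputs and
// weights.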
layerConfig.set_size(layerSize); - layerConfig.set_reversed(reversed); - layerConfig.add_inputs(); - LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); - input.set_input_layer_name("layer_0"); - input.set_input_parameter_name("para_0"); - - LayerPtr testLayer = Layer::create(layerConfig); - layerMap[testLayer->getName()] = testLayer; - - testLayer->init(layerMap, parameterMap); - testLayer->setNeedGradient(true); - - return testLayer; -} - -void checkMKLPackedLayer(LayerConfig layerConfig1, - LayerConfig layerConfig2, - bool reversed, - int layerSize, - int batchSize, - bool useBatch1, - bool useBatch2) { - LayerPtr dataLayer; - ParameterPtr para, bias; - - if (layerConfig1.type() == "recurrent") { - dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false); - para = creatParameter("para_0", 0, layerSize * layerSize, false); - bias = nullptr; - } else if (layerConfig1.type() == "gated_recurrent") { - dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false); - para = creatParameter("para_0", 0, layerSize * layerSize * 3, false); - bias = creatParameterBias("bias_0", 1, layerSize * 3, false); - } - - LayerPtr testLayer1 = initMKLPackedLayer( - layerConfig1, reversed, layerSize, dataLayer, para, bias); - LayerPtr testLayer2 = initMKLPackedLayer( - layerConfig2, reversed, layerSize, dataLayer, para, bias); - - const VectorPtr& weightGrad = - (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); - const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); - CpuVector wgt_grad1(weightGrad->getSize()); - CpuVector wgt_grad2(weightGrad->getSize()); - CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); - CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); - - for (int i = 0; i < 2; i++) { - FLAGS_rnn_use_batch = useBatch1; - - testLayer1->forward(PASS_GC); - - FLAGS_rnn_use_batch = useBatch2; - testLayer2->forward(PASS_GC); - - testLayer1->getOutputGrad()->randomizeUniform(); - testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad()); - - weightGrad->zero(); - inputGrad->zero(); - FLAGS_rnn_use_batch = useBatch1; - testLayer1->backward(nullptr); - - wgt_grad1.copyFrom(*weightGrad); - input_grad1.copyFrom(*inputGrad); - - weightGrad->zero(); - inputGrad->zero(); - FLAGS_rnn_use_batch = useBatch2; - testLayer2->backward(nullptr); - - wgt_grad2.copyFrom(*weightGrad); - input_grad2.copyFrom(*inputGrad); - - checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); - checkError(wgt_grad1, wgt_grad2); - checkError(input_grad1, input_grad2); - } -} - -TEST(MKLPackedLayer, RecurrentLayer) { - LayerConfig layerConfig1; - LayerConfig layerConfig2; - - layerConfig1.set_name("paddle-rnn"); - layerConfig1.set_type("recurrent"); - layerConfig1.set_active_type("relu"); - - layerConfig2.set_name("mkl-packed-rnn"); - layerConfig2.set_type("mkl_packed_recurrent"); - layerConfig2.set_active_type("relu"); - - FLAGS_use_gpu = false; - - for (auto layerSize : {32, 64, 128, 256, 512}) { - for (auto batchSize : {1, 5, 100, 500}) { - for (auto reversed : {true, false}) { - for (auto paddle_use_batch : {true, false}) { - for (auto MKLPacked_use_batch : {true, false}) { - LOG(INFO) << " layerSize=" << layerSize - << " batchSize=" << batchSize << " reversed=" << reversed - << " paddle_use_batch=" << paddle_use_batch - << " MKLPacked_use_batch=" << MKLPacked_use_batch; - - checkMKLPackedLayer(layerConfig1, - layerConfig2, - reversed, - layerSize, - batchSize, - paddle_use_batch, - MKLPacked_use_batch); - } - } - } 
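// Each checkMKLPackedLayer() call above runs the reference recurrent layer
// and its mkl_packed_recurrent counterpart through forward and backward in
// every combination of FLAGS_rnn_use_batch, then requires the output
// values, weight gradients, and input gradients to agree within
// checkError()'s tolerance (1e-3 for single precision).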
- } - } -} -#endif - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - if (!version::isWithGpu()) { - testing::GTEST_FLAG(filter) = "-Layer.*"; - } - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp deleted file mode 100644 index 583e3bc545a3b5eb158490a8ccc5ea7060c7c6ab..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp +++ /dev/null @@ -1,471 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/FullyConnectedLayer.h" -#include "paddle/gserver/layers/Layer.h" -#include "paddle/gserver/layers/SelectiveFullyConnectedLayer.h" -#include "paddle/math/CpuSparseMatrix.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(num_passes); -DECLARE_string(config); -DECLARE_string(init_model_path); -DECLARE_string(config_args); - -size_t fcLayerWidth = 1024; - -struct ComData { - vector outArgs; - vector parameters; -}; - -int randint(int* data, size_t int_max, size_t size) { - srand((size_t)(time(NULL))); - if (int_max < size) { - return -1; - } - size_t count = 0; - std::map tmp; - int this_int = 0; - - while (count < size) { - this_int = std::rand() % int_max; // NOLINT - if (tmp.find(this_int) == tmp.end()) { - tmp[this_int] = 0; - count += 1; - } - } - - if (tmp.size() != size) { - return -1; - } - count = 0; - for (auto itr = tmp.begin(); itr != tmp.end(); ++itr) { - data[count] = itr->first; - count += 1; - } - return 0; -} - -void calcOutput(ComData& comData, - const string configFile, - const string configArgs, - bool useGpu) { - FLAGS_config = configFile; - FLAGS_config_args = configArgs; - FLAGS_use_gpu = useGpu; - FLAGS_init_model_path = "gserver/tests/SelectiveFcTest/model"; - *ThreadLocalRand::getSeed() = 0; - srand(0); - - Trainer trainer; - trainer.init(TrainerConfigHelper::createFromFlags(), false); - - comData.parameters = trainer.getGradientMachine()->getParameters(); - - auto dataProvider = trainer.getDataProvider(); - int32_t batchSize = trainer.getConfig().opt_config().batch_size(); - DataBatch dataBatch; - dataProvider->setSkipShuffle(); - dataProvider->reset(); - dataProvider->getNextBatch(batchSize, &dataBatch); - CHECK(dataBatch.getSize()) << "No data from data provider"; - - vector& inArgs = dataBatch.getStreams(); - trainer.getGradientMachine()->start(trainer.getConfig(), nullptr); - trainer.getGradientMachine()->forwardBackward( - inArgs, &comData.outArgs, PASS_TRAIN); - trainer.getGradientMachine()->finish(); -} - -void checkMatrix(real* A, real* B, size_t matSize) { -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - int diffNum = 0; - for (size_t i = 0; i < matSize; ++i) { - if 
(std::isinf(A[i]) || std::isnan(A[i]) || std::isinf(B[i]) || - std::isnan(B[i])) { - } else if (fabs(A[i] - B[i]) > err) { - diffNum++; - } - } - EXPECT_EQ(0, diffNum); -} - -void checkTranspose(real* matrix, - real* transpose, - size_t width, - size_t matSize) { -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - size_t height = matSize / width; - int diffNum = 0; - size_t rowId = 0; - size_t colId = 0; - for (size_t i = 0; i < matSize; ++i) { - if (i % width == 0 && i) { - rowId++; - } - colId = i % width; - if (fabs(matrix[i] - transpose[colId * height + rowId]) > err) { - diffNum++; - LOG(INFO) << i << " diff : " << matrix[i] << "\t" - << transpose[colId * height + rowId]; - } - } - EXPECT_EQ(0, diffNum); -} - -void compareOutput(ComData& fcData, ComData& selFcData) { - vector outArgsFc = fcData.outArgs; - vector outArgsSelfc = selFcData.outArgs; - - // check cost - LOG(INFO) << "Check cost"; - CpuMatrix fcCost(outArgsFc[0].value->getHeight(), - outArgsFc[0].value->getWidth()); - CpuMatrix selfcCost(outArgsSelfc[0].value->getHeight(), - outArgsSelfc[0].value->getWidth()); - fcCost.copyFrom(*outArgsFc[0].value); - selfcCost.copyFrom(*outArgsSelfc[0].value); - checkMatrix(fcCost.getData(), selfcCost.getData(), fcCost.getElementCnt()); - - // check selective fc output and fc output - LOG(INFO) << "Compare output of SelectiveFullyConectedLayer " - << "with FullyConectedLayer"; - CpuMatrix fcOut(outArgsFc[1].value->getHeight(), - outArgsFc[1].value->getWidth()); - CpuMatrix selfcOut(outArgsSelfc[1].value->getHeight(), - outArgsSelfc[1].value->getWidth()); - - fcOut.copyFrom(*outArgsFc[1].value); - selfcOut.copyFrom(*outArgsSelfc[1].value); - checkMatrix(fcOut.getData(), selfcOut.getData(), fcOut.getElementCnt()); - - // check gradient math - vector& fcParam = fcData.parameters; - vector& selfcParam = selFcData.parameters; - for (size_t i = 0; i < fcParam.size(); ++i) { - ParameterPtr p1, p2; - p1 = fcParam[i]; - p2 = selfcParam[i]; - - string paramName = p1->getName(); - LOG(INFO) << "check parameter : " << paramName; - - // check parameter value - CpuVector paraValue1(p1->getSize()); - CpuVector paraValue2(p2->getSize()); - paraValue1.copyFrom(*p1->getBuf(PARAMETER_VALUE)); - paraValue2.copyFrom(*p2->getBuf(PARAMETER_VALUE)); - - // check gradient - CpuVector paraGrad1(*p1->getBuf(PARAMETER_GRADIENT)); - CpuVector paraGrad2(*p2->getBuf(PARAMETER_GRADIENT)); - if (paramName == "rand_fc_param.bias") { - checkMatrix( - paraValue1.getData(), paraValue2.getData(), paraValue1.getSize()); - checkMatrix( - paraGrad1.getData(), paraGrad2.getData(), paraGrad1.getSize()); - } else { - checkTranspose(paraValue1.getData(), - paraValue2.getData(), - fcLayerWidth, - paraValue1.getSize()); - checkTranspose(paraGrad1.getData(), - paraGrad2.getData(), - fcLayerWidth, - paraGrad1.getSize()); - } - } -} - -void compareSparseMulOutput( - real* fcOutput, - real* selOutput, - size_t nnz, - const std::shared_ptr>>& selCols) { -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - size_t nnzCount = - std::accumulate(selCols->begin(), - selCols->end(), - 0UL, - [](size_t a, const std::pair& arr) { - return a + arr.second; - }); - EXPECT_EQ(nnz, nnzCount); - - size_t sampleNum = selCols->size(); - int diffNum = 0; - size_t count = 0; - for (size_t i = 0; i < sampleNum; ++i) { - for (size_t j = 0; j < (*selCols)[i].second; ++j) { - size_t selIdx = (*selCols)[i].first[j]; - if (fabs(fcOutput[i * fcLayerWidth + selIdx] - selOutput[count]) > err) { - 
diffNum++; - LOG(INFO) << count << " diff : " << fcOutput[i * fcLayerWidth + selIdx] - << "\t" << selOutput[count]; - } - count++; - } - } - EXPECT_EQ(0, diffNum); -} - -LayerPtr creatDataLayer(string name, - size_t batchSize, - size_t layerSize, - std::vector& values, - bool useGpu) { - LayerConfig dataConfig; - dataConfig.set_name(name); - dataConfig.set_type("data"); - dataConfig.set_size(layerSize); - LayerPtr layer = LayerPtr(new DataLayer(dataConfig)); - - Argument data; - data.value = Matrix::create(batchSize, layerSize, false, useGpu); - data.value->copyFrom(values.data(), batchSize * layerSize); - - DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); - dataLayer->setData(data); - dataLayer->forward(PASS_TEST); - return layer; -} - -ParameterPtr creatParameter( - string name, int pid, size_t paraSize, string paramFile, bool useGpu) { - ParameterConfig paraConfig; - paraConfig.set_name(name); - paraConfig.set_size(paraSize); - - ParameterPtr parameter = - std::make_shared(paraConfig, useGpu, /*initialize */ false); - parameter->enableType(PARAMETER_VALUE); - parameter->randomize(); - parameter->setID(pid); - parameter->load(paramFile); - return parameter; -} - -LayerPtr initFcLayer(LayerPtr dataLayer, - LayerConfig layerConfig, - int dataLayerSize, - int fcLayerSize, - string paraName, - string paraFile, - bool useGpu) { - LayerMap layerMap; - ParameterMap parameterMap; - - layerMap[dataLayer->getName()] = dataLayer; - ParameterPtr para = creatParameter( - paraName, 0, dataLayerSize * fcLayerSize, paraFile, useGpu); - parameterMap[para->getName()] = para; - - layerConfig.add_inputs(); - LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); - input.set_input_layer_name(dataLayer->getName()); - input.set_input_parameter_name(paraName); - - LayerPtr testLayer = Layer::create(layerConfig); - layerMap[testLayer->getName()] = testLayer; - - testLayer->setNeedGradient(false); - testLayer->init(layerMap, parameterMap); - return testLayer; -} - -#ifndef PADDLE_TYPE_DOUBLE -// The parameter file used in fc.conf and selective_fc.conf is float -TEST(Layer, SelectiveFcLayer_train_dense_mul) { - const string& fcConfig = "gserver/tests/SelectiveFcTest/conf/fc.conf"; - const string& fcConfigArgs = - "filelist=gserver/tests/SelectiveFcTest/dense_mul_list"; - const string& selFcConfig = - "gserver/tests/SelectiveFcTest/conf/selective_fc.conf"; - const string& selConfigArgs = - "filelist=gserver/tests/SelectiveFcTest/dense_mul_list"; - - for (auto useGpu : {false, true}) { -#ifndef PADDLE_WITH_CUDA - if (useGpu) { - break; - } -#endif - LOG(INFO) << "FullyConnectedLayer forwardBackward()"; - ComData fcData; - calcOutput(fcData, fcConfig, fcConfigArgs, useGpu); - - LOG(INFO) << "SelectiveFullyConnectedLayer forwardBackward()"; - ComData selFcData; - calcOutput(selFcData, selFcConfig, selConfigArgs, useGpu); - compareOutput(fcData, selFcData); - } -} -#endif // PADDLE_TYPE_DOUBLE - -void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config, - bool useGpu) { - FLAGS_use_gpu = useGpu; - size_t batchSize = 100; - size_t dataLayerSize = 512; - std::vector values(batchSize * dataLayerSize); - for (size_t j = 0; j < batchSize * dataLayerSize; ++j) { - values[j] = std::rand() / real(RAND_MAX); - } - LayerPtr dataLayer = - creatDataLayer("data", batchSize, dataLayerSize, values, useGpu); - - const string& selfcParaFile = - "gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose"; - const string& selfcParaName = "rand_fc_param.w.transpose"; - - std::shared_ptr selfcLayer = - 
std::dynamic_pointer_cast( - initFcLayer(dataLayer, - config, - dataLayerSize, - fcLayerWidth, - selfcParaName, - selfcParaFile, - useGpu)); - - // create selected columns - std::shared_ptr>> selCols( - new std::vector>(batchSize)); - size_t maxNNZ = 30; - srand((size_t)(time(NULL))); - int total = 0; - while (total == 0) { - for (size_t i = 0; i < batchSize; ++i) { - size_t num = std::rand() % maxNNZ; - int* data = new int[num]; - randint(data, fcLayerWidth, num); - (*selCols)[i] = std::make_pair(data, num); - total += num; - } - } - selfcLayer->fillSelectiveData(selCols); - selfcLayer->forward(PASS_TEST); - - MatrixPtr outMatSelfc = selfcLayer->getOutputValue(); - CpuSparseMatrixPtr cpuOutMatSelfc( - new CpuSparseMatrix(outMatSelfc->getHeight(), - outMatSelfc->getWidth(), - outMatSelfc->getElementCnt())); - cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT); -#ifdef PADDLE_WITH_CUDA - if (useGpu) { - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - } -#endif - real* outValueSelfc = cpuOutMatSelfc->getValue(); - size_t nnz = cpuOutMatSelfc->getElementCnt(); - - const string& fcParaFile = - "gserver/tests/SelectiveFcTest/model/rand_fc_param.w"; - const string& fcParaName = "rand_fc_param.w"; - LayerConfig fcLayerConfig; - fcLayerConfig.set_name("fc_layer"); - fcLayerConfig.set_type("fc"); - fcLayerConfig.set_active_type("linear"); - fcLayerConfig.set_size(fcLayerWidth); - - LayerPtr fcLayer = initFcLayer(dataLayer, - fcLayerConfig, - dataLayerSize, - fcLayerWidth, - fcParaName, - fcParaFile, - useGpu); - fcLayer->forward(PASS_TEST); - - MatrixPtr outMatFc = fcLayer->getOutputValue(); - MatrixPtr cpuOutMatFc( - new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth())); - cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT); -#ifdef PADDLE_WITH_CUDA - if (useGpu) { - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - } -#endif - real* outValueFc = cpuOutMatFc->getData(); - - compareSparseMulOutput(outValueFc, outValueSelfc, nnz, selCols); - for (size_t i = 0; i < batchSize; ++i) { - delete[](*selCols)[i].first; - } -} - -#ifndef PADDLE_TYPE_DOUBLE -// The parameter file used in testSelectiveFcLayerTrainSparseMul is float -TEST(Layer, SelectiveFcLayer_train_sparse_mul) { - LayerConfig selLayerConfig; - selLayerConfig.set_name("sel_fc"); - selLayerConfig.set_type("selective_fc"); - selLayerConfig.set_active_type("linear"); - selLayerConfig.set_has_selected_colums(false); - selLayerConfig.set_selective_fc_pass_generation(true); - selLayerConfig.set_size(fcLayerWidth); - - testSelectiveFcLayerTrainSparseMul(selLayerConfig, false); -#ifdef PADDLE_WITH_CUDA - testSelectiveFcLayerTrainSparseMul(selLayerConfig, true); -#endif -} -#endif // PADDLE_TYPE_DOUBLE - -// TODO(dangqingqing) test multi threads after support in matrix -// TEST(Layer, SelectiveFcLayer_train_sparse_mul_parallel) { -// LayerConfig selLayerConfig; -// selLayerConfig.set_name("sel_fc"); -// selLayerConfig.set_type("selective_fc"); -// selLayerConfig.set_active_type("linear"); -// selLayerConfig.set_has_selected_colums(false); -// selLayerConfig.set_selective_fc_pass_generation(true); -// selLayerConfig.set_selective_fc_parallel_plain_mul_thread_num(10); -// selLayerConfig.set_selective_fc_full_mul_ratio(1000); -// selLayerConfig.set_size(fcLayerWidth); -// SelectiveFcLayer_test(selLayerConfig, false); -// } - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - initPython(argc, argv); - int ret = RUN_ALL_TESTS(); - return ret; -} diff --git 
a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp deleted file mode 100644 index 406ca63b6ee030a0882e05294d8d355d84531385..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_int32(gpu_id); -DECLARE_bool(thread_local_rand_use_global_seed); - -const int MAX_SEQ_NUM = 17; -const int MAX_SEQ_LEN = 23; -const int MAX_BEAM_SIZE = 13; - -const size_t SEED = (size_t)(time(NULL)); - -vector randSampling(real range, int n) { - CHECK_GE(range, n); - vector num(range); - iota(begin(num), end(num), 0.); - if (range == n) return num; - - random_shuffle(begin(num), end(num)); - num.resize(n); - sort(begin(num), end(num)); - return num; -} - -void genSeqInfo(vector& seqStartPos, vector& subSeqStartPos) { - seqStartPos.resize(1, 0); - subSeqStartPos.resize(1, 0); - - srand(SEED); - int seqNum = 1 + (rand() % MAX_SEQ_NUM); - for (int i = 0; i < seqNum; ++i) { - int subSeqNum = 1 + (rand() % MAX_SEQ_NUM); - for (int j = 0; j < subSeqNum; ++j) - subSeqStartPos.push_back(subSeqStartPos.back() + - (1 + (rand() % MAX_SEQ_LEN))); - seqStartPos.push_back(subSeqStartPos.back()); - } -} - -/* - generate start indices according to sequence start positions. - */ -void genStarts(vector& seqStartPos, - vector>& starts, - size_t beamSize) { - starts.clear(); - starts.resize(seqStartPos.size() - 1, vector(beamSize, -1.)); - - for (size_t i = 0; i < seqStartPos.size() - 1; ++i) { - int seqLen = seqStartPos[i + 1] - seqStartPos[i]; - vector randStarts = - randSampling(seqLen, min(seqLen, static_cast(beamSize))); - copy(begin(randStarts), end(randStarts), begin(starts[i])); - } -} - -/* - generate end indices according to sequence start positions and start indices. - */ -void genEnds(vector& seqStartPos, - vector>& starts, - vector>& ends, - size_t beamSize) { - CHECK_EQ(seqStartPos.size() - 1, starts.size()); - ends.clear(); - ends.resize(seqStartPos.size() - 1, vector(beamSize, -1.)); - - for (size_t i = 0; i < starts.size(); ++i) { - for (size_t j = 0; j < starts[i].size(); ++j) { - int seqLen = seqStartPos[i + 1] - seqStartPos[i]; - CHECK_GE(seqLen - 1, starts[i][j]); - if (starts[i][j] == -1.) break; - if (starts[i][j] == (seqLen - 1)) { - ends[i][j] = starts[i][j]; - } else { - ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0]; - } - } - } -} - -void genTestData(vector& seqStartPos, - vector& subSeqStartPos, - vector>& starts, - vector>& ends, - bool hasSubseq) { - size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE); - genSeqInfo(seqStartPos, subSeqStartPos); - - genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize); - genEnds(hasSubseq ? 
subSeqStartPos : seqStartPos, starts, ends, beamSize); -} - -template <typename T> -void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) { - size_t totalSize{0}; - for (auto const& items : inVec) totalSize += items.size(); - outVec.reserve(totalSize); - - for (auto& items : inVec) - move(items.begin(), items.end(), back_inserter(outVec)); -} - -void testSeqSliceLayer(bool hasSubseq, - bool useGpu, - vector<int>& seqStartPos, - vector<int>& subSeqStartPos, - vector<vector<real>>& starts, - vector<vector<real>>& ends) { - // layer size is not crucial for this layer, - // so use a small layer size in the unittest. - const size_t layerSize{4}; - TestConfig config; - config.layerConfig.set_type("seq_slice"); - config.layerConfig.set_size(layerSize); - - // add the first input - MatrixPtr seqInputPtr = - Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(), - layerSize, - false, - false); - seqInputPtr->randomizeUniform(); - - if (hasSubseq) { - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, - "seq_input", - seqInputPtr, - seqStartPos, - subSeqStartPos}); - } else { - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos}); - } - config.layerConfig.add_inputs(); - - // add start indices - if (starts.size()) { - vector<real> startsToVec; - flatten2dVector(starts, startsToVec); - - MatrixPtr startMatrixPtr = - Matrix::create(starts.size(), starts[0].size(), false, false); - startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size()); - - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr}); - config.layerConfig.add_inputs(); - config.layerConfig.set_select_first(true); - } - - // add end indices - if (ends.size()) { - vector<real> endsToVec; - flatten2dVector(ends, endsToVec); - - MatrixPtr endMatrixPtr = - Matrix::create(ends.size(), ends[0].size(), false, false); - endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size()); - - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr}); - config.layerConfig.add_inputs(); - config.layerConfig.set_select_first(false); - } - - testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false); -} - -TEST(Layer, SeqSliceLayer) { - vector<int> seqStartPos; - vector<int> subSeqStartPos; - vector<vector<real>> starts; - vector<vector<real>> ends; - - std::vector<bool> mode = {false}; -#ifdef PADDLE_WITH_CUDA - mode.push_back(true); -#endif - genSeqInfo(seqStartPos, subSeqStartPos); - for (bool hasSubseq : {true, false}) { - LOG(INFO) << "hasSubSeq : " << hasSubseq; - genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq); - for (bool useGpu : mode) { - vector<vector<real>> tmp; - testSeqSliceLayer( - hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends); - testSeqSliceLayer( - hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp); - testSeqSliceLayer( - hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends); - } - } -} - -int main(int argc, char** argv) { - initMain(argc, argv); - hl_start(); - hl_init(FLAGS_gpu_id); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_Upsample.cpp b/paddle/gserver/tests/test_Upsample.cpp deleted file mode 100644 index 39b902fcc75e71007f855e4e258e54ed8d40f16b..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_Upsample.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "LayerGradUtil.h" -#include "paddle/math/MathUtils.h" -#include "paddle/testing/TestUtil.h" - -void setPoolConfig(paddle::TestConfig* config, - paddle::PoolConfig* pool, - const string& poolType) { - (*config).biasSize = 0; - (*config).layerConfig.set_type("pool"); - (*config).layerConfig.set_num_filters(1); - - int kw = 2, kh = 2; - int pw = 0, ph = 0; - int sw = 2, sh = 2; - pool->set_pool_type(poolType); - pool->set_channels(2); - pool->set_size_x(kw); - pool->set_size_y(kh); - pool->set_start(0); - pool->set_padding(pw); - pool->set_padding_y(ph); - pool->set_stride(sw); - pool->set_stride_y(sh); - - int ow = - paddle::outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); - int oh = - paddle::outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); - pool->set_output_x(ow); - pool->set_output_y(oh); -} - -paddle::LayerPtr doOneUpsampleTest(const paddle::MatrixPtr& inputMat, - const string& poolType, - bool use_gpu, - real* tempGradData) { - /* prepare maxPoolWithMaskLayer */ - paddle::TestConfig config; - config.inputDefs.push_back({paddle::INPUT_DATA, "layer_0", 128, 0}); - paddle::LayerInputConfig* input = config.layerConfig.add_inputs(); - paddle::PoolConfig* pool = input->mutable_pool_conf(); - - pool->set_img_size(8); - pool->set_img_size_y(8); - setPoolConfig(&config, pool, "max-pool-with-mask"); - config.layerConfig.set_size(pool->output_x() * pool->output_y() * - pool->channels()); - - config.layerConfig.set_name("MaxPoolWithMask"); - - std::vector dataLayers; - paddle::LayerMap layerMap; - vector datas; - - initDataLayer(config, - &dataLayers, - &datas, - &layerMap, - "MaxPoolWithMask", - 1, - false, - use_gpu); - - dataLayers[0]->getOutputValue()->copyFrom(*inputMat); - - FLAGS_use_gpu = use_gpu; - std::vector parameters; - paddle::LayerPtr maxPoolingWithMaskOutputLayer; - initTestLayer(config, &layerMap, ¶meters, &maxPoolingWithMaskOutputLayer); - maxPoolingWithMaskOutputLayer->forward(paddle::PASS_GC); - - /* prepare the upsample layer */ - paddle::LayerConfig upsampleLayerConfig; - upsampleLayerConfig.set_type("upsample"); - paddle::LayerInputConfig* input1 = upsampleLayerConfig.add_inputs(); - upsampleLayerConfig.add_inputs(); - - paddle::UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf(); - upsampleConfig->set_scale(2); - paddle::ImageConfig* imageConfig = upsampleConfig->mutable_image_conf(); - imageConfig->set_channels(2); - imageConfig->set_img_size(4); - imageConfig->set_img_size_y(4); - upsampleLayerConfig.set_size(2 * 8 * 8); - upsampleLayerConfig.set_name("upsample"); - - for (size_t i = 0; i < 2; i++) { - paddle::LayerInputConfig& inputTemp = - *(upsampleLayerConfig.mutable_inputs(i)); - inputTemp.set_input_layer_name("MaxPoolWithMask"); - } - - paddle::LayerPtr upsampleLayer; - paddle::ParameterMap parameterMap; - upsampleLayer = paddle::Layer::create(upsampleLayerConfig); - layerMap[upsampleLayerConfig.name()] = upsampleLayer; - 
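// Both inputs of the upsample layer are pointed at "MaxPoolWithMask":
// input 0 is expected to supply the pooled values and input 1 the argmax
// mask recorded during pooling, which is what lets upsampling scatter each
// pooled value back to the position it came from (an inference from the
// layer wiring in this test, not spelled out here). The code below
// initializes the layer, runs forward, injects a fixed output gradient,
// and backpropagates so the caller can compare CPU and GPU results.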
upsampleLayer->init(layerMap, parameterMap); - upsampleLayer->setNeedGradient(true); - upsampleLayer->forward(paddle::PASS_GC); - upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128); - upsampleLayer->backward(); - - return upsampleLayer; -} - -TEST(Layer, maxPoolingWithMaskOutputLayerFwd) { - bool useGpu = false; - paddle::MatrixPtr inputMat; - paddle::MatrixPtr inputGPUMat; - paddle::MatrixPtr tempGradMat; - - inputMat = paddle::Matrix::create(1, 128, false, useGpu); - inputMat->randomizeUniform(); - - tempGradMat = paddle::Matrix::create(1, 128, false, useGpu); - tempGradMat->randomizeUniform(); - real* tempGradData = tempGradMat->getData(); - - paddle::LayerPtr upsampleLayerCPU = - doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData); - -#ifdef PADDLE_WITH_CUDA - useGpu = true; - real* data = inputMat->getData(); - inputGPUMat = paddle::Matrix::create(1, 128, false, useGpu); - inputGPUMat->copyFrom(data, 128); - paddle::LayerPtr upsampleLayerGPU = doOneUpsampleTest( - inputGPUMat, "max-pool-with-mask", useGpu, tempGradData); - paddle::checkMatrixEqual(upsampleLayerCPU->getOutput("").value, - upsampleLayerGPU->getOutput("").value); - - paddle::checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(), - upsampleLayerGPU->getPrev(0)->getOutputGrad()); -#endif -} diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp deleted file mode 100644 index f2299d7da2a51e4015793ae531af002aed1f6b2f..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_WarpCTCLayer.cpp +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/CTCLayer.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/Layer.h" -#include "paddle/gserver/layers/WarpCTCLayer.h" - -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); - -const real* getData(const Matrix& matrix) { - if (matrix.useGpu()) { - MatrixPtr cpuMatrix = Matrix::create( - matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false); - cpuMatrix->copyFrom(matrix); - return cpuMatrix->getData(); - } else { - return matrix.getData(); - } -} - -int checkError(const Matrix& matrix1, const Matrix& matrix2) { - CHECK_EQ(matrix1.getHeight(), matrix2.getHeight()); - CHECK_EQ(matrix1.getWidth(), matrix2.getWidth()); - CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed()); -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - - int height = matrix1.getHeight(); - int width = matrix1.getWidth(); - - const real* data1 = getData(matrix1); - const real* data2 = getData(matrix2); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { - count++; - } - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; - return count; -} - -void initArgument(size_t batchSize, - int layerSize, - bool useGpu, - Argument& data) { - data.value = Matrix::create(batchSize, layerSize, false, useGpu); - data.grad = Matrix::create(batchSize, layerSize, false, useGpu); - data.value->randomizeUniform(); - data.value->add(-0.5); - data.grad->zeroMem(); - - generateSequenceStartPositions(batchSize, data.sequenceStartPositions); -} - -LayerPtr createDataLayer( - string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) { - LayerConfig layerConfig; - layerConfig.set_name(name); - layerConfig.set_type("data"); - layerConfig.set_size(layerSize); - LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); - - DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); - dataLayer->setData(data); - dataLayer->forward(PASS_GC); - - return layer; -} - -LayerPtr createLabelLayer(string name, - size_t batchSize, - size_t numClasses, - bool useGpu) { - LayerConfig layerConfig; - layerConfig.set_name(name); - layerConfig.set_type("data"); - layerConfig.set_size(1); - LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); - - Argument data; - data.ids = IVector::create(batchSize, useGpu); - data.ids->rand(numClasses - 1); - - generateSequenceStartPositions(batchSize, data.sequenceStartPositions); - - DataLayerPtr labelLayer = std::dynamic_pointer_cast(layer); - labelLayer->setData(data); - labelLayer->forward(PASS_GC); - - return layer; -} - -LayerPtr createCTCLayer(string name, - size_t numClasses, - bool useGpu, - bool normByTimes, - LayerPtr dataLayer, - LayerPtr labelLayer) { - LayerMap layerMap; - layerMap[dataLayer->getName()] = dataLayer; - layerMap[labelLayer->getName()] = labelLayer; - - ParameterMap parameterMap; - - LayerConfig layerConfig; - layerConfig.set_name(name); - layerConfig.set_type("ctc"); - layerConfig.set_size(numClasses); - layerConfig.set_norm_by_times(normByTimes); - - layerConfig.add_inputs(); - LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); - input0.set_input_layer_name(dataLayer->getName()); - - layerConfig.add_inputs(); - LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); - 
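// A CTC cost layer consumes two inputs: slot 0 carries the per-frame class
// scores from the data layer, and slot 1 (wired next) carries the label
// sequence. Below, a softmax activation is applied to the data layer's
// output before the forward pass because CTCLayer expects normalized
// probabilities; createWarpCTCLayer skips this step since warp-ctc applies
// the softmax internally.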
input1.set_input_layer_name(labelLayer->getName()); - - LayerPtr layer = LayerPtr(new CTCLayer(layerConfig)); - layerMap[layer->getName()] = layer; - layer->init(layerMap, parameterMap); - - ActivationFunction* softmaxActivation = ActivationFunction::create("softmax"); - - softmaxActivation->forward(dataLayer->getOutput()).check(); - layer->forward(PASS_GC); - - layer->backward(); - softmaxActivation->backward(dataLayer->getOutput()).check(); - - return layer; -} - -LayerPtr createWarpCTCLayer(string name, - size_t numClasses, - bool useGpu, - bool normByTimes, - LayerPtr dataLayer, - LayerPtr labelLayer) { - LayerMap layerMap; - layerMap[dataLayer->getName()] = dataLayer; - layerMap[labelLayer->getName()] = labelLayer; - - ParameterMap parameterMap; - - LayerConfig layerConfig; - layerConfig.set_name(name); - layerConfig.set_type("warp_ctc"); - layerConfig.set_size(numClasses); - layerConfig.set_blank(numClasses - 1); - layerConfig.set_norm_by_times(normByTimes); - - layerConfig.add_inputs(); - LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); - input0.set_input_layer_name(dataLayer->getName()); - - layerConfig.add_inputs(); - LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); - input1.set_input_layer_name(labelLayer->getName()); - - LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig)); - layerMap[layer->getName()] = layer; - layer->init(layerMap, parameterMap); - - layer->forward(PASS_GC); - layer->backward(); - - return layer; -} - -TEST(Layer, WarpCTCLayer) { - for (auto layerSize : {10, 64}) { - for (auto batchSize : {1, 10, 32}) { - for (auto normByTimes : {false, true}) { - for (auto useGpu : {false, true}) { -#ifndef PADDLE_WITH_CUDA - if (useGpu) continue; -#endif - LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize - << " normByTimes = " << normByTimes << " useGpu=" << useGpu; - - FLAGS_use_gpu = useGpu; - - Argument data0; - initArgument(batchSize, layerSize, useGpu, data0); - - Argument data1; - data1.resizeAndCopyFrom(data0); - - LayerPtr dataLayer0 = - createDataLayer("data", batchSize, layerSize, useGpu, data0); - LayerPtr dataLayer1 = - createDataLayer("data", batchSize, layerSize, useGpu, data1); - - LayerPtr labelLayer = - createLabelLayer("label", batchSize, layerSize, useGpu); - - LayerPtr warpctcLayer = createWarpCTCLayer( - "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer); - LayerPtr ctcLayer = createCTCLayer( - "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer); - - /// Check cost - LOG(INFO) << "Check cost: " - << checkError(*(warpctcLayer->getOutput().value), - *(ctcLayer->getOutput().value)) - << " different elements."; - - /// Check gradients - LOG(INFO) << "Check gradients: " - << checkError(*(dataLayer0->getOutput().grad), - *(dataLayer1->getOutput().grad)) - << " different elements"; - } - } - } - } -} diff --git a/paddle/legacy/api/Arguments.cpp b/paddle/legacy/api/Arguments.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7bb5a6f75b9a8ab800fc74c6cc01c0b104ccdd5e --- /dev/null +++ b/paddle/legacy/api/Arguments.cpp @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" + +#include "paddle/legacy/parameter/Argument.h" + +size_t Arguments::getSlotNum() const { return m->outputs.size(); } + +Arguments* Arguments::createArguments(size_t slotNum) { + auto args = new Arguments(); + args->m->outputs.resize(slotNum); + return args; +} + +void Arguments::resize(size_t slotNum) { m->outputs.resize(slotNum); } + +Arguments::Arguments() : m(new ArgumentsPrivate()) {} + +Arguments::~Arguments() { delete m; } + +Arguments* Arguments::createByPaddleArgumentVector(void* ptr) { + auto p = (std::vector*)(ptr); + auto args = new Arguments(); + args->m->outputs = *p; + return args; +} + +Arguments* Arguments::createByPaddleArgument(const void* ptr) { + auto p = (paddle::Argument*)(ptr); + auto args = new Arguments(); + args->m->outputs.push_back(*p); + return args; +} + +Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return Matrix::createByPaddleMatrixPtr(&a.value); +} + +Matrix* Arguments::getSlotGrad(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return Matrix::createByPaddleMatrixPtr(&a.grad); +} + +IVector* Arguments::getSlotIds(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return IVector::createByPaddleVectorPtr(&a.ids); +} + +Matrix* Arguments::getSlotIn(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return Matrix::createByPaddleMatrixPtr(&a.in); +} + +void Arguments::setSlotValue(size_t idx, Matrix* mat) throw(RangeError) { + auto& a = m->getArg(idx); + a.value = m->cast(mat->getSharedPtr()); +} + +void Arguments::setSlotGrad(size_t idx, Matrix* mat) throw(RangeError) { + auto& a = m->getArg(idx); + a.grad = m->cast(mat->getSharedPtr()); +} + +void Arguments::setSlotIn(size_t idx, Matrix* mat) throw(RangeError) { + auto& a = m->getArg(idx); + a.in = m->cast(mat->getSharedPtr()); +} + +void Arguments::setSlotIds(size_t idx, IVector* vec) throw(RangeError) { + auto& a = m->getArg(idx); + auto& v = m->cast(vec->getSharedPtr()); + a.ids = v; +} + +template +static inline void doCopyFromSafely(std::shared_ptr& dest, + std::shared_ptr& src) { + if (src) { + if (dest) { + dest->copyFrom(*src); + } else { + dest = src; + } + } +} + +IVector* Arguments::getSlotSequenceStartPositions(size_t idx) const + throw(RangeError) { + auto& a = m->getArg(idx); + if (a.sequenceStartPositions) { + return IVector::createByPaddleVectorPtr( + &a.sequenceStartPositions->getMutableVector(false)); + } else { + return nullptr; + } +} + +IVector* Arguments::getSlotSubSequenceStartPositions(size_t idx) const + throw(RangeError) { + auto& a = m->getArg(idx); + if (a.subSequenceStartPositions) { + return IVector::createByPaddleVectorPtr( + &a.subSequenceStartPositions->getMutableVector(false)); + } else { + return nullptr; + } +} + +void Arguments::setSlotSequenceStartPositions(size_t idx, + IVector* vec) throw(RangeError) { + auto& a = m->getArg(idx); + auto& v = m->cast(vec->getSharedPtr()); + a.sequenceStartPositions = std::make_shared(v); +} + +void Arguments::setSlotSubSequenceStartPositions( + size_t idx, IVector* vec) 
throw(RangeError) { + auto& a = m->getArg(idx); + auto& v = m->cast(vec->getSharedPtr()); + a.subSequenceStartPositions = std::make_shared(v); +} + +IVector* Arguments::getSlotSequenceDim(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return IVector::createByPaddleVectorPtr(&a.cpuSequenceDims); +} + +void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) { + auto& a = m->getArg(idx); + a.cpuSequenceDims = m->cast(vec->getSharedPtr()); +} + +float Arguments::sum() const { return paddle::Argument::sum(m->outputs); } + +int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return a.getBatchSize(); +} + +void Arguments::setSlotFrameHeight(size_t idx, size_t h) throw(RangeError) { + auto& a = m->getArg(idx); + a.setFrameHeight(h); +} + +void Arguments::setSlotFrameWidth(size_t idx, size_t w) throw(RangeError) { + auto& a = m->getArg(idx); + a.setFrameWidth(w); +} + +size_t Arguments::getSlotFrameHeight(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return a.getFrameHeight(); +} + +size_t Arguments::getSlotFrameWidth(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return a.getFrameWidth(); +} + +void* Arguments::getInternalArgumentsPtr() const { return &m->outputs; } diff --git a/paddle/api/CMakeLists.txt b/paddle/legacy/api/CMakeLists.txt similarity index 100% rename from paddle/api/CMakeLists.txt rename to paddle/legacy/api/CMakeLists.txt diff --git a/paddle/api/ConfigParser.cpp b/paddle/legacy/api/ConfigParser.cpp similarity index 100% rename from paddle/api/ConfigParser.cpp rename to paddle/legacy/api/ConfigParser.cpp diff --git a/paddle/api/Evaluator.cpp b/paddle/legacy/api/Evaluator.cpp similarity index 100% rename from paddle/api/Evaluator.cpp rename to paddle/legacy/api/Evaluator.cpp diff --git a/paddle/legacy/api/GradientMachine.cpp b/paddle/legacy/api/GradientMachine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5ad2fe11a4c668a318f76492f57091f386183986 --- /dev/null +++ b/paddle/legacy/api/GradientMachine.cpp @@ -0,0 +1,196 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/paddle/api/CMakeLists.txt b/paddle/legacy/api/CMakeLists.txt
similarity index 100%
rename from paddle/api/CMakeLists.txt
rename to paddle/legacy/api/CMakeLists.txt
diff --git a/paddle/api/ConfigParser.cpp b/paddle/legacy/api/ConfigParser.cpp
similarity index 100%
rename from paddle/api/ConfigParser.cpp
rename to paddle/legacy/api/ConfigParser.cpp
diff --git a/paddle/api/Evaluator.cpp b/paddle/legacy/api/Evaluator.cpp
similarity index 100%
rename from paddle/api/Evaluator.cpp
rename to paddle/legacy/api/Evaluator.cpp
diff --git a/paddle/legacy/api/GradientMachine.cpp b/paddle/legacy/api/GradientMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5ad2fe11a4c668a318f76492f57091f386183986
--- /dev/null
+++ b/paddle/legacy/api/GradientMachine.cpp
@@ -0,0 +1,196 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
+
+#include "Internal.h"
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+
+std::vector<int> GradientMachine::defaultParamTypes = {
+    PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
+
+GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
+
+GradientMachine::~GradientMachine() { delete m; }
+
+GradientMachine* GradientMachine::createFromPaddleModelPtr(
+    const void* confPtr,
+    GradientMatchineCreateMode mode,
+    const std::vector<int>& types) {
+  auto& conf = *(const paddle::ModelConfig*)(confPtr);
+  std::vector<paddle::ParameterType> realTypes;
+  staticCastVector(&realTypes, types);
+  auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes);
+  auto machinePtr = std::shared_ptr<paddle::GradientMachine>(machineRawPtr);
+  if (machinePtr != nullptr) {
+    auto machine = new GradientMachine();
+    machine->m->machine = machinePtr;
+    return machine;
+  } else {
+    return nullptr;
+  }
+}
+
+GradientMachine* GradientMachine::createByConfigProtoStr(
+    const std::string& protoStr,
+    GradientMatchineCreateMode mode,
+    const std::vector<int>& types) {
+  paddle::ModelConfig conf;
+  conf.ParseFromString(protoStr);
+  if (conf.IsInitialized()) {
+    return GradientMachine::createFromPaddleModelPtr(&conf, mode, types);
+  } else {
+    return nullptr;
+  }
+}
+
+GradientMachine* GradientMachine::createByModelConfig(
+    ModelConfig* conf,
+    GradientMatchineCreateMode mode,
+    const std::vector<int>& types) {
+  auto confPtr = &conf->m->conf->getModelConfig();
+  return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
+}
+
+void GradientMachine::start() { m->machine->start(); }
+
+void GradientMachine::finish() { m->machine->finish(); }
+
+void GradientMachine::onPassEnd() { m->machine->onPassEnd(); }
+
+void GradientMachine::prefetch(const Arguments& inArgs) {
+  auto& in =
+      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
+  m->machine->prefetch(in);
+}
+
+void GradientMachine::forward(const Arguments& inArgs,
+                              Arguments* outArgs,
+                              PassType passType) {
+  auto& in =
+      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
+  auto& out = m->cast<std::vector<paddle::Argument>>(
+      outArgs->getInternalArgumentsPtr());
+  paddle::PassType pt = (paddle::PassType)(passType);
+  m->machine->forward(in, &out, pt);
+}
+
+UpdateCallback::~UpdateCallback() {}
+
+void UpdateCallback::apply(Parameter* p) {
+  // UNUSED(p);
+}
+
+class UpdateCallbackWrapper {
+ public:
+  explicit UpdateCallbackWrapper(const UpdateCallback& callback)
+      : callback(const_cast<UpdateCallback&>(callback)) {}
+
+  void operator()(paddle::Parameter* param) {
+    auto p = Parameter::createFromRawPtr(&param);
+    // @TODO Use Stack variable instead.
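+    // The callback may be implemented in Python (SWIG director), so the
+    // Parameter wrapper is heap-allocated here and deleted as soon as
+    // apply() returns; the Python side must not retain a reference to it.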
+    callback.apply(p);
+    delete p;
+  }
+
+ private:
+  UpdateCallback& callback;
+};
+
+void GradientMachine::backward(const UpdateCallback& callback) {
+  m->machine->backward(UpdateCallbackWrapper(callback));
+}
+
+void GradientMachine::forwardBackward(const Arguments& inArgs,
+                                      Arguments* outArgs,
+                                      PassType passType,
+                                      const UpdateCallback& callback) {
+  auto& in =
+      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
+  auto& out = m->cast<std::vector<paddle::Argument>>(
+      outArgs->getInternalArgumentsPtr());
+  paddle::PassType pt = (paddle::PassType)(passType);
+  m->machine->forwardBackward(in, &out, pt, UpdateCallbackWrapper(callback));
+}
+
+void GradientMachine::loadParameters(const std::string& path) {
+  m->machine->loadParameters(path);
+}
+
+size_t GradientMachine::getParameterSize() const {
+  return m->machine->getParameters().size();
+}
+
+Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
+  auto params = m->machine->getParameters();
+  if (i < params.size()) {
+    return Parameter::createFromSharedPtr(&m->machine->getParameters()[i]);
+  } else {
+    throw RangeError();
+  }
+}
+
+size_t GradientMachine::getNonStaticParameterSize() const {
+  return m->machine->getNonStaticParameters().size();
+}
+
+Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) {
+  auto params = m->machine->getNonStaticParameters();
+  if (i < params.size()) {
+    return Parameter::createFromSharedPtr(
+        &m->machine->getNonStaticParameters()[i]);
+  } else {
+    throw RangeError();
+  }
+}
+
+void GradientMachine::randParameters() { m->machine->randParameters(); }
+
+Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const
+    throw(UnsupportError) {
+  auto nn = m->machine;
+  if (nn) {
+    auto arg = nn->getLayerOutput(layerName);
+    return Arguments::createByPaddleArgument(&arg);
+  } else {
+    throw UnsupportError();
+  }
+}
+
+SequenceGenerator* GradientMachine::asSequenceGenerator(
+    const std::vector<std::string>& dict,
+    size_t begin_id,
+    size_t end_id,
+    size_t max_length,
+    size_t beam_size) {
+  SequenceGenerator* r =
+      SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
+  r->setDict(dict);
+  r->setBos(begin_id);
+  r->setEos(end_id);
+  r->setMaxLength(max_length);
+  r->setBeamSize(beam_size);
+  return r;
+}
+
+Evaluator* GradientMachine::makeEvaluator() {
+  auto ev = new Evaluator();
+  ev->m->rawPtr = m->machine->makeEvaluator();
+  return ev;
+}
+
+void GradientMachine::eval(Evaluator* evaluator) {
+  m->machine->eval(evaluator->m->rawPtr);
+}
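+
+// Typical inference-style use of this wrapper (illustrative only; in
+// practice the class is driven from Python through the SWIG bindings):
+//   GradientMachine* gm = GradientMachine::createByConfigProtoStr(protoStr);
+//   Arguments* out = Arguments::createArguments(0);
+//   gm->forward(*in, out, PASS_TEST);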
diff --git a/paddle/api/Internal.h b/paddle/legacy/api/Internal.h
similarity index 100%
rename from paddle/api/Internal.h
rename to paddle/legacy/api/Internal.h
diff --git a/paddle/legacy/api/Matrix.cpp b/paddle/legacy/api/Matrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8862d0ea92c92a2608b49c6b1315badae9e9fd98
--- /dev/null
+++ b/paddle/legacy/api/Matrix.cpp
@@ -0,0 +1,317 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/math/Matrix.h"
+#include <cstring>
+#include <iostream>
+#include "PaddleAPI.h"
+#include "paddle/legacy/math/CpuSparseMatrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+
+struct MatrixPrivate {
+  std::shared_ptr<paddle::Matrix> mat;
+};
+
+Matrix::Matrix() : m(new MatrixPrivate()) {}
+
+Matrix* Matrix::createByPaddleMatrixPtr(void* sharedPtr) {
+  auto* mat = reinterpret_cast<paddle::MatrixPtr*>(sharedPtr);
+  if ((*mat) != nullptr) {
+    auto m = new Matrix();
+    m->m->mat = *mat;
+    return m;
+  } else {
+    return nullptr;
+  }
+}
+
+Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
+  auto m = new Matrix();
+  m->m->mat = paddle::Matrix::create(height, width, useGpu);
+  m->m->mat->zero();
+  return m;
+}
+
+Matrix* Matrix::createDense(const std::vector<float>& data,
+                            size_t height,
+                            size_t width,
+                            bool useGpu) {
+  auto m = new Matrix();
+  m->m->mat = paddle::Matrix::create(height, width, useGpu);
+  m->m->mat->copyFrom(data.data(), data.size());
+  return m;
+}
+
+Matrix* Matrix::createDenseFromNumpy(float* data,
+                                     int dim1,
+                                     int dim2,
+                                     bool copy,
+                                     bool useGpu) throw(UnsupportError) {
+  if (useGpu) {
+    /// Gpu mode only supports copy=True
+    if (!copy) {
+      throw UnsupportError("Gpu mode only supports copy=True");
+    }
+    return Matrix::createGpuDenseFromNumpy(data, dim1, dim2);
+  } else {
+    return Matrix::createCpuDenseFromNumpy(data, dim1, dim2, copy);
+  }
+}
+
+Matrix* Matrix::createCpuDenseFromNumpy(float* data,
+                                        int dim1,
+                                        int dim2,
+                                        bool copy) {
+  auto m = new Matrix();
+  if (copy) {
+    m->m->mat = paddle::Matrix::create(dim1, dim2);
+    m->m->mat->copyFrom(data, dim1 * dim2);
+  } else {
+    m->m->mat = paddle::Matrix::create(data, dim1, dim2, false);
+  }
+  return m;
+}
+
+Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
+  auto m = new Matrix();
+  m->m->mat = paddle::Matrix::create(dim1, dim2, false, true);
+  m->m->mat->copyFrom(data, dim1 * dim2);
+  return m;
+}
+
+Matrix* Matrix::createSparse(size_t height,
+                             size_t width,
+                             size_t nnz,
+                             bool isNonVal,
+                             bool isTrans,
+                             bool useGpu) {
+  auto m = new Matrix();
+  m->m->mat = paddle::Matrix::createSparseMatrix(
+      height,
+      width,
+      nnz,
+      isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+      isTrans,
+      useGpu);
+  return m;
+}
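+
+// Illustrative use of the sparse factory above (hypothetical values,
+// assuming the default CSR layout where `rows` holds height + 1 offsets):
+//   Matrix* s = Matrix::createSparse(2, 4, 3, /* isNonVal= */ false);
+//   s->sparseCopyFrom({0, 2, 3}, {0, 1, 3}, {1.0f, 2.0f, 3.0f});
+// builds a 2x4 CSR matrix whose first row holds columns 0 and 1.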
+
+Matrix::~Matrix() { delete m; }
+
+size_t Matrix::getHeight() const { return m->mat->getHeight(); }
+
+size_t Matrix::getWidth() const { return m->mat->getWidth(); }
+
+float Matrix::get(size_t x, size_t y) const throw(RangeError) {
+  if (x >= this->getWidth() || y >= this->getHeight()) {
+    RangeError e;
+    throw e;
+  }
+  return m->mat->getElement(x, y);
+}
+
+void Matrix::set(size_t x, size_t y, float val) throw(RangeError,
+                                                       UnsupportError) {
+  if (x >= this->getWidth() || y >= this->getHeight()) {
+    RangeError e;
+    throw e;
+  }
+  auto rawMat = m->mat.get();
+  if (auto cDenseMat = dynamic_cast<paddle::CpuMatrix*>(rawMat)) {
+    *(cDenseMat->getData() + x + y * cDenseMat->getWidth()) = val;
+  } else {
+    UnsupportError e;
+    throw e;
+  }
+}
+
+bool Matrix::isSparse() const {
+  auto raw_mat = m->mat.get();
+  return dynamic_cast<paddle::CpuSparseMatrix*>(raw_mat) != nullptr ||
+         dynamic_cast<paddle::GpuSparseMatrix*>(raw_mat) != nullptr;
+}
+
+SparseValueType Matrix::getSparseValueType() const throw(UnsupportError) {
+  auto cpuSparseMat =
+      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
+  if (cpuSparseMat != nullptr) {
+    return (SparseValueType)cpuSparseMat->getValueType();
+  } else {
+    auto gpuSparseMat =
+        std::dynamic_pointer_cast<paddle::GpuSparseMatrix>(m->mat);
+    if (gpuSparseMat != nullptr) {
+      return (SparseValueType)gpuSparseMat->getValueType();
+    } else {
+      UnsupportError e;
+      throw e;
+    }
+  }
+}
+
+SparseFormatType Matrix::getSparseFormat() const throw(UnsupportError) {
+  auto cpuSparseMat =
+      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
+  if (cpuSparseMat != nullptr) {
+    return (SparseFormatType)cpuSparseMat->getFormat();
+  } else {
+    auto gpuSparseMat =
+        std::dynamic_pointer_cast<paddle::GpuSparseMatrix>(m->mat);
+    if (gpuSparseMat != nullptr) {
+      return SPARSE_CSR;
+    } else {
+      UnsupportError e;
+      throw e;
+    }
+  }
+}
+
+IntArray Matrix::getSparseRowCols(size_t i) const
+    throw(UnsupportError, RangeError) {
+  auto cpuSparseMat =
+      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
+  if (cpuSparseMat != nullptr &&
+      cpuSparseMat->getFormat() == paddle::SPARSE_CSR) {
+    if (i < cpuSparseMat->getHeight()) {
+      // cpuSparseMat->print(std::cout);
+      size_t len = cpuSparseMat->getColNum(i);
+      return IntArray(cpuSparseMat->getRowCols(i), len);
+    } else {
+      RangeError e;
+      throw e;
+    }
+  } else {
+    UnsupportError e;
+    throw e;
+  }
+}
+
+IntWithFloatArray Matrix::getSparseRowColsVal(size_t i) const
+    throw(UnsupportError, RangeError) {
+  auto cpuSparseMat =
+      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
+  if (cpuSparseMat != nullptr &&
+      cpuSparseMat->getValueType() == paddle::FLOAT_VALUE) {
+    if (i < cpuSparseMat->getHeight()) {
+      return IntWithFloatArray(cpuSparseMat->getRowValues(i),
+                               cpuSparseMat->getRowCols(i),
+                               cpuSparseMat->getColNum(i));
+    } else {
+      RangeError e;
+      throw e;
+    }
+  } else {
+    UnsupportError e;
+    throw e;
+  }
+}
+
+FloatArray Matrix::getData() const {
+  auto rawMat = m->mat.get();
+  if (dynamic_cast<paddle::GpuMemoryHandle*>(rawMat->getMemoryHandle().get())) {
+    // is gpu; copy the data to a host buffer first
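+    // needFree = true hands ownership of the temporary host buffer to the
+    // caller; the SWIG typemap for FloatArray in Paddle.i deletes it after
+    // the Python list has been built.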
+    float* data = rawMat->getData();
+    size_t len = rawMat->getElementCnt();
+    float* cpuData = new float[len];
+    hl_memcpy_device2host(cpuData, data, len * sizeof(float));
+    FloatArray ret_val(cpuData, len);
+    ret_val.needFree = true;
+    return ret_val;
+  } else {
+    FloatArray ret_val(rawMat->getData(), rawMat->getElementCnt());
+    return ret_val;
+  }
+}
+
+void Matrix::sparseCopyFrom(
+    const std::vector<int>& rows,
+    const std::vector<int>& cols,
+    const std::vector<float>& vals) throw(UnsupportError) {
+  auto cpuSparseMat =
+      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
+  if (cpuSparseMat != nullptr) {
+    // LOG(INFO) << "RowSize = " << rows.size()
+    //           << " ColSize = " << cols.size()
+    //           << " ValSize = " << vals.size();
+    cpuSparseMat->copyFrom(const_cast<std::vector<int>&>(rows),
+                           const_cast<std::vector<int>&>(cols),
+                           const_cast<std::vector<float>&>(vals));
+  } else {
+    UnsupportError e;
+    throw e;
+  }
+}
+
+void* Matrix::getSharedPtr() const { return &m->mat; }
+
+void Matrix::toNumpyMatInplace(float** view_data,
+                               int* dim1,
+                               int* dim2) throw(UnsupportError) {
+  auto cpuMat = std::dynamic_pointer_cast<paddle::CpuMatrix>(m->mat);
+  if (cpuMat) {
+    *view_data = cpuMat->getData();
+    *dim1 = cpuMat->getHeight();
+    *dim2 = cpuMat->getWidth();
+  } else {
+    throw UnsupportError();
+  }
+}
+
+void Matrix::copyToNumpyMat(float** view_m_data,
+                            int* dim1,
+                            int* dim2) throw(UnsupportError) {
+  if (this->isSparse()) {
+    throw UnsupportError();
+  } else {
+    *dim1 = m->mat->getHeight();
+    *dim2 = m->mat->getWidth();
+    *view_m_data = new float[(*dim1) * (*dim2)];
+    if (auto cpuMat = dynamic_cast<paddle::CpuMatrix*>(m->mat.get())) {
+      auto src = cpuMat->getData();
+      auto dest = *view_m_data;
+      std::memcpy(dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
+    } else if (auto gpuMat = dynamic_cast<paddle::GpuMatrix*>(m->mat.get())) {
+      auto src = gpuMat->getData();
+      auto dest = *view_m_data;
+      hl_memcpy_device2host(
+          dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
+    } else {
+      LOG(WARNING) << "Unexpected Situation";
+      throw UnsupportError();
+    }
+  }
+}
+
+void Matrix::copyFromNumpyMat(float* data,
+                              int dim1,
+                              int dim2) throw(UnsupportError, RangeError) {
+  if (isSparse()) {
+    throw UnsupportError();
+  } else {
+    if (this->getHeight() == (size_t)dim1 && this->getWidth() == (size_t)dim2) {
+      if (m->mat->getData() != data) {
+        m->mat->copyFrom(data, dim1 * dim2);
+      }
+    } else {
+      throw RangeError();
+    }
+  }
+}
+
+bool Matrix::isGpu() const {
+  auto rawPtr = m->mat.get();
+  return dynamic_cast<paddle::GpuMatrix*>(rawPtr) != nullptr ||
+         dynamic_cast<paddle::GpuSparseMatrix*>(rawPtr) != nullptr;
+}
diff --git a/paddle/legacy/api/Paddle.i b/paddle/legacy/api/Paddle.i
new file mode 100644
index 0000000000000000000000000000000000000000..e6165fb10689ae3183d094a0be340aae5644c1cf
--- /dev/null
+++ b/paddle/legacy/api/Paddle.i
@@ -0,0 +1,202 @@
+%module(directors="1") swig_paddle
+%include "std_string.i"
+%{
+#define SWIG_FILE_WITH_INIT
+#include "legacy/api/PaddleAPI.h"
+%}
+
+%include "exception.i"
+%typemap(throws) UnsupportError %{
+  SWIG_exception(SWIG_RuntimeError, $1.what());
+  SWIG_fail;
+%}
+
+%include "std_vector.i"
+%include "std_pair.i"
+#ifdef SWIGPYTHON
+%include "numpy.i"
+#endif
+
+%init %{
+#ifdef SWIGPYTHON
+import_array();
+#endif
+%}
+
+
+namespace std {
+%template(vector_int) vector<int>;
+%template(vector_uint) vector<unsigned int>;
+%template(vector_float) vector<float>;
+%template(vector_string) vector<string>;
+%template(vector_vec_star) vector<Vector*>;
+}
+#ifdef SWIGPYTHON
+%typemap(in) (int argc, char** argv) {
+  int i = 0;
+  if (!PyList_Check($input)) {
+    PyErr_SetString(PyExc_ValueError, "Expecting a list");
+    return NULL;
+  }
+  $1 = PyList_Size($input);
+  $2 = (char **) malloc(($1+1)*sizeof(char *));
+  for (i = 0; i < $1; i++) {
+    PyObject *s = PyList_GetItem($input,i);
+    if (!PyString_Check(s)) {
+      free($2);
+      PyErr_SetString(PyExc_ValueError, "List items must be strings");
+      return NULL;
+    }
+    $2[i] = PyString_AsString(s);
+  }
+  $2[i] = 0;
+}
+%typemap(freearg) (int argc, char** argv) {
+  if ($2) free($2);
+}
+
+%typemap(out) FloatArray {
+  $result = PyList_New($1.length);
+  for (size_t i=0; i<$1.length; ++i) {
+    PyList_SetItem($result, i, PyFloat_FromDouble($1.buf[i]));
+  }
+  if($1.needFree) {
+    delete [] $1.buf;
+  }
+}
+
+%typemap(out) IntArray {
+  $result = PyList_New($1.length);
+  for (size_t i=0; i<$1.length; ++i) {
+    PyList_SetItem($result, i,
PyInt_FromLong($1.buf[i])); + } + if ($1.needFree) { + delete [] $1.buf; + } +} + +%typemap(out) IntWithFloatArray { + $result = PyList_New($1.length); + for (size_t i=0; i<$1.length; ++i) { + PyList_SetItem($result, i, PyTuple_Pack(2, + PyInt_FromLong($1.idxBuf[i]), + PyFloat_FromDouble($1.valBuf[i]) + )); + } + if ($1.needFree) { + delete [] $1.idxBuf; + delete [] $1.valBuf; + } +} + + +%rename(__getitem__) IVector::get; +%rename(__setitem__) IVector::set; +%rename(__len__) IVector::getSize; +%rename(__getitem__) Vector::get; +%rename(__setitem__) Vector::set; +%rename(__len__) Vector::getSize; +%rename(__len__) Parameter::getSize; +%rename(__call__) ParameterTraverseCallback::apply; +%rename(__repr__) Evaluator::toString; + +%apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { + (float* data, int dim1, int dim2) +} + +%apply (float** ARGOUTVIEW_ARRAY2, int* DIM1, int* DIM2) { + (float** view_data, int* dim1, int* dim2) +} + +%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) { + (float** view_m_data, int* dim1, int* dim2) +} + +%apply (int** ARGOUTVIEWM_ARRAY1, int* DIM1) { + (int** view_m_data, int* dim1) +} + +%apply (int* INPLACE_ARRAY1, int DIM1) { + (int* data, int dim) +} + +%apply (int** ARGOUTVIEW_ARRAY1, int* DIM1) { + (int** view_data, int* dim1) +} + +%apply (float* INPLACE_ARRAY1, int DIM1) { + (float* data, int dim) +} + +%apply (float** ARGOUTVIEW_ARRAY1, int* DIM1) { + (float** view_data, int* dim1) +} + +%apply (float** ARGOUTVIEWM_ARRAY1, int* DIM1) { + (float** view_m_data, int* dim1) +} + +#endif +// The below functions internally create object by "new", so it should use +// use SWIG to handle gc. There are hints for SWIG to handle GC. +%newobject Matrix::createZero; +%newobject Matrix::createSparse; +%newobject Matrix::createDense; +%newobject Matrix::createDenseFromNumpy; +%newobject Matrix::createCpuDenseFromNumpy; +%newobject Matrix::createGpuDenseFromNumpy; +%newobject Vector::createZero; +%newobject Vector::create; +%newobject Vector::createVectorFromNumpy; +%newobject Vector::createCpuVectorFromNumpy; +%newobject Vector::createGpuVectorFromNumpy; +%newobject IVector::createZero; +%newobject IVector::create; +%newobject IVector::createVectorFromNumpy; +%newobject IVector::createCpuVectorFromNumpy; +%newobject IVector::createGpuVectorFromNumpy; +%newobject Trainer::createByCommandLine; +%newobject Trainer::getForwardOutput; +%newobject Trainer::getLayerOutput; +%newobject Arguments::getSlotValue; +%newobject Arguments::getSlotIds; +%newobject Arguments::getSlotIn; +%newobject Arguments::getSlotSequenceStartPositions; +%newobject Arguments::getSlotSequenceDim; +%newobject Arguments::createArguments; +%newobject GradientMachine::createByConfigProtoStr; +%newobject GradientMachine::createByModelConfig; +%newobject GradientMachine::asSequenceGenerator; +%newobject GradientMachine::getParameter; +%newobject GradientMachine::getLayerOutput; +%newobject GradientMachine::makeEvaluator; +%newobject TrainerConfig::createFromTrainerConfigFile; +%newobject TrainerConfig::getModelConfig; +%newobject TrainerConfig::getOptimizationConfig; +%newobject Parameter::getBuf; +%newobject Parameter::getConfig; +%newobject ParameterOptimizer::create; +%newobject ParameterOptimizer::needSpecialTraversal; +%newobject ParameterUpdater::createLocalUpdater; +%newobject ParameterUpdater::createRemoteUpdater; +%newobject ParameterUpdater::createNewRemoteUpdater; + +%feature("director") UpdateCallback; +%feature("autodoc", 1); // To generate method stub, for code hint in ide + +// Ignore 
many private classes and methods that cannot be handled by swig.
+%ignore MatrixPrivate;
+%ignore TrainerPrivate;
+%ignore IVector::operator[];
+%ignore ArgumentsPrivate;
+%ignore GradientMachinePrivate;
+%ignore TrainerConfigPrivate;
+%ignore ModelConfigPrivate;
+%ignore ParameterPrivate;
+%ignore SequenceGeneratorPrivate;
+%ignore VectorPrivate;
+%ignore ParameterConfigPrivate;
+%ignore OptimizationConfigPrivate;
+%ignore ParameterTraverseCallbackPrivate;
+%include "utils/GlobalConstants.h"
+%include "legacy/api/PaddleAPI.h"
diff --git a/paddle/legacy/api/PaddleAPI.h b/paddle/legacy/api/PaddleAPI.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba3e8154980b5e04bb531b8162847077ed5578df
--- /dev/null
+++ b/paddle/legacy/api/PaddleAPI.h
@@ -0,0 +1,1054 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/utils/Common.h"
+#include "paddle/utils/GlobalConstants.h"
+
+/// Import PaddlePaddle's enumeration into global namespace.
+using namespace paddle::enumeration_wrapper;  // NOLINT
+
+/**
+ * @brief Initialize paddle.
+ *
+ * In python, this method should be invoked as
+ * @code
+ *  import sys
+ *  import paddle
+ *  paddle.initPaddle(sys.argv)
+ *  or you can change arguments as any list of str.
+ * @endcode
+ */
+void initPaddle(int argc, char** argv);
+
+/// Return FLAGS_use_gpu
+bool isUsingGpu();
+
+/// Set the Flags_use_gpu to the given parameter
+void setUseGpu(bool useGpu);
+
+/// Return true if this py_paddle is compiled in GPU Version
+bool isGpuVersion();
+
+/// Return FLAGS_trainer_count
+int getTrainerCount();
+
+/// The Error of IO Operation. Such as file not found, etc.
+class IOError {};
+
+/// Out of range error
+class RangeError {};
+
+/// Not support Error, such as access GPU memory directly, etc.
+class UnsupportError : public std::runtime_error {
+ public:
+  UnsupportError() : std::runtime_error(" ") {}
+  explicit UnsupportError(const std::string& message)
+      : std::runtime_error(message) {}
+};
+
+/// This type will map to python's list of float.
+struct FloatArray {
+  const float* buf;
+  const size_t length;
+  bool needFree;  // true if the buf is dynamic alloced.
+  FloatArray(const float* b, const size_t l);
+};
+
+/// This type will map to python's list of int
+struct IntArray {
+  const int* buf;
+  const size_t length;
+  bool needFree;
+  IntArray(const int* b, const size_t l, bool f = false);
+};
+
+/// This type will map to python's list of (int, float)
+struct IntWithFloatArray {
+  const float* valBuf;
+  const int* idxBuf;
+  const size_t length;
+  bool needFree;
+  IntWithFloatArray(const float* v, const int* i, size_t l, bool f = false);
+};
+
+enum SparseValueType { SPARSE_NON_VALUE = 0, SPARSE_VALUE = 1 };
+
+enum SparseFormatType { SPARSE_CSR = 0, SPARSE_CSC = 1 };
+
+/**
+ * In Python, -1UL is hard to write. So define a const value used by python
+ * side.
+ */ +const size_t NO_SPARSE_ID = -1UL; + +struct MatrixPrivate; +class Matrix { + Matrix(); // User Cannot Create Matrix. + DISABLE_COPY(Matrix); + static Matrix* createByPaddleMatrixPtr(void* sharedPtr); + + public: + virtual ~Matrix(); + + /** + * Create A Matrix with height,width, which is filled by zero. + */ + static Matrix* createZero(size_t height, + size_t width, + bool useGpu = isUsingGpu()); + + /** + * Create Sparse Matrix. + * + * After create sparse, sparseCopyFrom can be used to fill matrix. + * + * @param nnz Number of non zero values. + * + * @note the default sparse type is SPARSE_CSR. + */ + static Matrix* createSparse(size_t height, + size_t width, + size_t nnz, + bool isNonVal = true, + bool trans = false, + bool useGpu = isUsingGpu()); + + /** + * Create Dense Matrix. + * + * @param data list of float should be passed in python. + * @note the value will be copy into a new matrix. + */ + static Matrix* createDense(const std::vector& data, + size_t height, + size_t width, + bool useGpu = isUsingGpu()); + + static Matrix* createDenseFromNumpy( + float* data, + int dim1, + int dim2, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); + + /** + * Create Cpu Dense Matrix from numpy matrix, dtype=float32 + * + * @param data a numpy matrix. + * @param dim1 dimension of data. + * @param dim2 dimension of data. + * @param copy true if copy into a new matrix, false will create + * matrix inplace. copy = false should be used with extreme + * care because Matrix will share the memory with the given + * numpy array. If the numpy array object is no longer valid, + * the memory space will not be usable. + */ + static Matrix* createCpuDenseFromNumpy(float* data, + int dim1, + int dim2, + bool copy = true); + + /// Create Gpu Dense Matrix from numpy matrix, dtype=float32 + static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2); + + /** + * Cast to numpy matrix. + * + * @note This method take no parameter in python. + * @note This method in python will return a numpy matrix, not void. + * @note Only CpuDenseMatrix is supported. + * + * Example: + * @code + * import paddle + * m = paddle.Matrix.createZero(10,2) + * numpy_mat = m.toNumpyMat() + * @endcode + */ + void toNumpyMatInplace(float** view_data, + int* dim1, + int* dim2) throw(UnsupportError); + + /// Copy To numpy mat. + void copyToNumpyMat(float** view_m_data, + int* dim1, + int* dim2) throw(UnsupportError); + + /// Copy From Numpy Mat + void copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError, + RangeError); + + /// return true if this matrix is sparse. + bool isSparse() const; + + SparseValueType getSparseValueType() const throw(UnsupportError); + + SparseFormatType getSparseFormat() const throw(UnsupportError); + + IntArray getSparseRowCols(size_t i) const throw(UnsupportError, RangeError); + + IntWithFloatArray getSparseRowColsVal(size_t i) const + throw(UnsupportError, RangeError); + + size_t getHeight() const; + + size_t getWidth() const; + + float get(size_t x, size_t y) const throw(RangeError); + + void set(size_t x, size_t y, float val) throw(RangeError, UnsupportError); + + /// return type is list of float + FloatArray getData() const; + + /** + * Copy from rows, cols, values. 
+ * + * if sparse_nonvalue, the values should be [] + */ + void sparseCopyFrom(const std::vector& rows, + const std::vector& cols, + const std::vector& values = + std::vector()) throw(UnsupportError); + + bool isGpu() const; + + private: + void* getSharedPtr() const; + + MatrixPrivate* m; + friend class Trainer; + friend class GradientMachine; + friend class Arguments; +}; + +struct VectorPrivate; +class Vector { + DISABLE_COPY(Vector); + Vector(); + static Vector* createByPaddleVectorPtr(void* ptr); + + void* getSharedPtr(); + + public: + ~Vector(); + + /// Create Vector filled with zero. + static Vector* createZero(size_t sz, bool useGpu = isUsingGpu()); + + /** + * Create Vector from list of float. + * + * It will create a new vector, and copy data into it. + */ + static Vector* create(const std::vector& data, + bool useGpu = isUsingGpu()); + + static Vector* createVectorFromNumpy( + float* data, + int dim, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); + /** + * Create Cpu Vector from numpy array, which dtype=float32 + * + * If copy is false, it will create vector inplace. + */ + static Vector* createCpuVectorFromNumpy(float* data, + int dim, + bool copy = true); + + /// Create Gpu Vector from numpy array, which dtype=float32 + static Vector* createGpuVectorFromNumpy(float* data, int dim); + + /** + * copy from another vector + * throw(RangeError) if size of src vector is different from size of this + * vector + */ + void copyFrom(Vector* src) throw(RangeError); + + /// Cast to numpy array inplace. + void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError); + + /// Copy to numpy array. + void copyToNumpyArray(float** view_m_data, int* dim1); + + /// Copy from numpy array. + void copyFromNumpyArray(float* data, int dim); + + /// __getitem__ in python + float get(const size_t idx) const throw(RangeError, UnsupportError); + + /// __setitem__ in python + void set(const size_t idx, float val) throw(RangeError, UnsupportError); + + /// Return is GPU vector or not. + bool isGpu() const; + + /// Return a list of float, the memory is alloced and copied. + FloatArray getData() const; + + /// __len__ in python + size_t getSize() const; + + private: + VectorPrivate* m; + + private: + friend class Parameter; + friend class ParameterOptimizer; + friend struct ParameterTraverseCallbackPrivate; +}; + +struct IVectorPrivate; +class IVector { + IVector(); + DISABLE_COPY(IVector); + static IVector* createByPaddleVectorPtr(void* ptr); + + public: + /// Create IVector filled with zero + static IVector* createZero(size_t sz, bool useGpu = isUsingGpu()); + + /** + * Create IVector from list of int. + * It will create a new vector, and copy data into it. + */ + static IVector* create(const std::vector& data, + bool useGpu = isUsingGpu()); + + static IVector* createVectorFromNumpy( + int* data, + int dim, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); + + /** + * Create Cpu IVector from numpy array, which dtype=int32 + * + * If copy is false, it will create vector inplace + */ + static IVector* createCpuVectorFromNumpy(int* data, + int dim, + bool copy = true); + /** + * Create Gpu IVector from numpy array, which dtype=int32 + */ + static IVector* createGpuVectorFromNumpy(int* data, int dim); + + /// Cast to numpy array inplace. + void toNumpyArrayInplace(int** view_data, int* dim1) throw(UnsupportError); + + /// Copy to numpy array. + void copyToNumpyArray(int** view_m_data, int* dim1); + + /// Copy from numpy array. 
+ void copyFromNumpyArray(int* data, int dim); + + virtual ~IVector(); + + /// Return a list of int, the memory is alloced and copied. + IntArray getData() const; + + /// This method will map to python [] method. + int& operator[](const size_t idx) throw(RangeError, UnsupportError); + + const int& operator[](const size_t idx) const + throw(RangeError, UnsupportError); + + inline int get(const size_t idx) const throw(RangeError, UnsupportError) { + return (*this)[idx]; + } + + inline void set(const size_t idx, int val) throw(RangeError, UnsupportError) { + (*this)[idx] = val; + } + + /// Return true if it is gpu vector. + bool isGpu() const; + + /// This method will map to python __len__(); + size_t getSize() const; + + private: + void* getSharedPtr() const; + + friend class Arguments; + IVectorPrivate* m; +}; + +struct ArgumentsPrivate; + +/// The Arguments is actual a std::vector in paddle. +class Arguments { + private: + Arguments(); // Internal Create. + DISABLE_COPY(Arguments); + + public: + /** + * Create a arguments with size. + * Note that it can be zero. + */ + static Arguments* createArguments(size_t slotNum); + + void resize(size_t slotNum); + + virtual ~Arguments(); + + /** + * Return the slot number that aguments contains. + * + * It is actually the vector's size + */ + size_t getSlotNum() const; + + /** + * The get functions of Arguments + * + * the param idx is the slot id + */ + Matrix* getSlotValue(size_t idx) const throw(RangeError); + Matrix* getSlotGrad(size_t idx) const throw(RangeError); + IVector* getSlotIds(size_t idx) const throw(RangeError); + Matrix* getSlotIn(size_t idx) const throw(RangeError); + IVector* getSlotSequenceStartPositions(size_t idx) const throw(RangeError); + IVector* getSlotSubSequenceStartPositions(size_t idx) const throw(RangeError); + IVector* getSlotSequenceDim(size_t idx) const throw(RangeError); + // End Of get functions of Arguments + + int64_t getBatchSize(size_t idx = 0) const throw(RangeError); + + /** + * The set functions of Arguments. + * + * The param idx is the slot id. + * The other param is the input Matrix or vector. + */ + void setSlotValue(size_t idx, Matrix* mat) throw(RangeError); + void setSlotGrad(size_t idx, Matrix* mat) throw(RangeError); + void setSlotIn(size_t idx, Matrix* mat) throw(RangeError); + void setSlotIds(size_t idx, IVector* vec) throw(RangeError); + void setSlotSequenceStartPositions(size_t idx, + IVector* vec) throw(RangeError); + void setSlotSubSequenceStartPositions(size_t idx, + IVector* vec) throw(RangeError); + void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError); + + /** + * Set the frame height of the idx-th Argument. + * + * @param ids The index of which Argument. + * @param h The height value. + */ + void setSlotFrameHeight(size_t idx, size_t h) throw(RangeError); + + /** + * Set the frame height of the idx-th Argument. + * + * @param ids The index of which Argument. + * @param h The height value. 
+ */ + void setSlotFrameWidth(size_t idx, size_t w) throw(RangeError); + + size_t getSlotFrameHeight(size_t idx = 0) const throw(RangeError); + size_t getSlotFrameWidth(size_t idx = 0) const throw(RangeError); + + float sum() const; + + private: + static Arguments* createByPaddleArgumentVector(void* ptr); + static Arguments* createByPaddleArgument(const void* ptr); + void* getInternalArgumentsPtr() const; + + private: + ArgumentsPrivate* m; + friend class Trainer; + friend class GradientMachine; + friend class SequenceGenerator; +}; + +enum GradientMatchineCreateMode { + CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal, + CREATE_MODE_SGD_SPARSE_CPU_TRAINING = + paddle::GradientMachine::kSgdSparseCpuTraining, + CREATE_MODE_TESTING = paddle::GradientMachine::kTesting +}; + +struct ParameterConfigPrivate; +class ParameterConfig { + DISABLE_COPY(ParameterConfig); + ParameterConfig(); + + /** + * Internal methods + */ + static ParameterConfig* createParameterConfigFromParameterSharedPtr( + void* ptr); + static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr); + void* getRawPtr(); + + public: + ~ParameterConfig(); + + /** + * return proto buf string. + */ + std::string toProtoString() const; + + private: + ParameterConfigPrivate* m; + + private: + friend class Parameter; + friend class ParameterOptimizer; + friend struct ParameterTraverseCallbackPrivate; +}; + +struct OptimizationConfigPrivate; +class OptimizationConfig { + DISABLE_COPY(OptimizationConfig); + OptimizationConfig(); + + public: + static OptimizationConfig* createFromProtoString(const std::string& str); + ~OptimizationConfig(); + + /** + * return protobuf string. + */ + std::string toProtoString(); + + private: + OptimizationConfigPrivate* m; + + friend class TrainerConfig; + friend class ParameterOptimizer; + friend class ParameterUpdater; + friend class Trainer; +}; + +struct ParameterPrivate; +class Parameter { + private: + Parameter(); + DISABLE_COPY(Parameter); + + public: + virtual ~Parameter(); + + /** + * get parameter name + */ + std::string getName() const; + + /** + * get buf in Parameter + */ + Vector* getBuf(ParameterType type); + + /** + * get id + */ + size_t getID() const; + + ParameterConfig* getConfig(); + void setValueUpdated(); + + bool save(const std::string& filename) const; + + bool load(const std::string& filename) const; + + size_t getSize() const; + + private: + static Parameter* createFromRawPtr(void* ptr); + static Parameter* createFromSharedPtr(void* ptr); + + private: + ParameterPrivate* m; + friend class UpdateCallbackWrapper; + friend class GradientMachine; + friend class ParameterUpdater; +}; + +struct ModelConfigPrivate; +/** + * You can only get model config from TrainerConfig. + * + * It is used by GradientMachine. + */ +class ModelConfig { + private: + ModelConfig(); + DISABLE_COPY(ModelConfig); + + public: + virtual ~ModelConfig(); + + private: + ModelConfigPrivate* m; + friend class TrainerConfig; + friend struct TrainerConfigPrivate; + friend class GradientMachine; +}; + +struct TrainerConfigPrivate; +/** + * To get TrainerConfig from file. + * + * It is used by GradientMachine. 
+ */ +class TrainerConfig { + private: + TrainerConfig(); + DISABLE_COPY(TrainerConfig); + + public: + virtual ~TrainerConfig(); + + static TrainerConfig* createFromTrainerConfigFile( + const std::string& configPath); + static TrainerConfig* createFromProtoString(const std::string& str); + + ModelConfig* getModelConfig() const; + + OptimizationConfig* getOptimizationConfig() const; + + private: + TrainerConfigPrivate* m; + friend class Trainer; +}; + +/** + * The callback in backword. + * + * You can inherit this class in python. + * + * @code + * class UpdateCallbackInPython(paddle.UpdateCallback): + * def __init__(self): + * paddle.UpdateCallback.__init__(self) + * + * def apply(self, param): + * assert isinstance(param, paddle.Parameter) + * @endcode + */ +class UpdateCallback { + public: + virtual ~UpdateCallback(); + virtual void apply(Parameter* p); +}; + +struct ParameterTraverseCallbackPrivate; +class ParameterTraverseCallback { + DISABLE_COPY(ParameterTraverseCallback); + ParameterTraverseCallback(); + + public: + ~ParameterTraverseCallback(); + + void apply(const std::vector& vecs, + const ParameterConfig& config, + size_t sparseId); + + private: + ParameterTraverseCallbackPrivate* m; + friend class ParameterOptimizer; +}; + +/** + * The ParameterOptimizer Wrapper Class. + * + * Basically same as common/ParameterOptimizer.h + */ +struct ParameterOptimizerPrivate; +class ParameterOptimizer { + DISABLE_COPY(ParameterOptimizer); + ParameterOptimizer(); + + public: + static ParameterOptimizer* create(OptimizationConfig* config); + + ~ParameterOptimizer(); + + void init(size_t numRows, const ParameterConfig* config); + + void startPass(); + + void finishPass(); + + void startBatch(size_t numSamplesProcessed); + + void finishBatch(); + + void update(const std::vector& vecs, + const ParameterConfig& conf, + size_t sparseId = NO_SPARSE_ID); + + std::vector getParameterTypes() const; + + ParameterTraverseCallback* needSpecialTraversal( + const ParameterConfig& config) const; + + private: + ParameterOptimizerPrivate* m; +}; + +class SequenceGenerator; +class Evaluator; +struct GradientMachinePrivate; +class GradientMachine { + private: + GradientMachine(); + DISABLE_COPY(GradientMachine); + + public: + virtual ~GradientMachine(); + + /** + * Create By ProtoStr. + * + * The ProtoStr can be generate by python's protobuf code. + */ + static GradientMachine* createByConfigProtoStr( + const std::string& protoStr, + GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, + const std::vector& parameterTypes = defaultParamTypes); + + /** + * Create by ModelConfig object. + * + * To get ModelConfig, you can get TrainerConfig from config file, then get + * model config by TrainerConfig + */ + static GradientMachine* createByModelConfig( + ModelConfig* conf, + GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, + const std::vector& parameterTypes = defaultParamTypes); + + /** + * @brief finish + */ + void finish(); + + void start(); + + /** + * Prefetch row ids of sparse parameter. + */ + void prefetch(const Arguments& inArgs); + + /** + * Do some thing when train pass ended. + */ + void onPassEnd(); + + /** + * The forward stage of GradientMachine. + * + * @note the outArgs could be zero length arguemnts. + * @note THIS METHOD IS VERY USEFULL FOR PREDICT FROM TRAINED MODEL. + */ + void forward(const Arguments& inArgs, Arguments* outArgs, PassType passType); + + /** + * The backward stage of GradientMachine. 
+   *
+   * @note Currently the ParameterUpdater is not wrapped in SWIG, so backward
+   * cannot actually train a network. But you can write an update callback to
+   * change the parameter or implement a ParameterUpdater on the python side.
+   */
+  void backward(const UpdateCallback& callback = UpdateCallback());
+
+  /**
+   * Combine forward/backward
+   */
+  void forwardBackward(const Arguments& inArgs,
+                       Arguments* outArgs,
+                       PassType passType,
+                       const UpdateCallback& callback = UpdateCallback());
+
+  void loadParameters(const std::string& path);
+
+  size_t getParameterSize() const;
+  Parameter* getParameter(size_t i) throw(RangeError);
+
+  size_t getNonStaticParameterSize() const;
+  Parameter* getNonStaticParameter(size_t i) throw(RangeError);
+
+  void randParameters();
+
+  Arguments* getLayerOutput(const std::string& layerName) const
+      throw(UnsupportError);
+
+  /**
+   * Create a sequence generator.
+   *
+   * @note It is just like paddle_gen_sequence.
+   */
+  SequenceGenerator* asSequenceGenerator(
+      const std::vector<std::string>& dict = std::vector<std::string>(),
+      size_t begin_id = 0UL,
+      size_t end_id = 0UL,
+      size_t max_length = 100UL,
+      size_t beam_size = -1UL);
+
+  Evaluator* makeEvaluator();
+
+  void eval(Evaluator* evaluator);
+
+ private:
+  GradientMachinePrivate* m;
+
+  static GradientMachine* createFromPaddleModelPtr(
+      const void* confPtr,
+      GradientMatchineCreateMode mode,
+      const std::vector<int>& types);
+
+  // Not to use c++ 11 init-list, so we use static var as function default arg.
+  static std::vector<int> defaultParamTypes;
+  friend class Trainer;
+  friend class ParameterUpdater;
+};
+
+struct ParameterUpdaterPrivate;
+class ParameterUpdater {
+ private:
+  ParameterUpdater();
+
+ public:
+  static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
+  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
+                                               int passCount,
+                                               bool useSparseUpdater);
+  static ParameterUpdater* createNewRemoteUpdater(
+      OptimizationConfig* config,
+      const std::string pserverSpec,
+      const bool useEtcd) throw(UnsupportError);
+  ~ParameterUpdater();
+
+  /**
+   * @brief initialize Parameter Updater by GradientMachine.
+   * @param gm
+   */
+  void init(const GradientMachine& gm);
+
+  /**
+   * @brief begin of a training/testing of one pass.
+   */
+  void startPass();
+
+  /**
+   * @brief end of a training/testing of one pass.
+   */
+  void finishPass();
+
+  /**
+   * @brief begin of a training/testing of one batch.
+   * @param data batch's size
+   * @return PassType, mostly will be training.
+   */
+  PassType startBatch(size_t batchSize);
+
+  /**
+   * @brief end of a training/testing of one batch
+   * @param cost current batch cost.
+   */
+  void finishBatch(float cost);
+
+  /**
+   * @brief update a parameter (by local optimizer or by cluster pserver)
+   * @param param
+   */
+  void update(Parameter* param);
+
+  /**
+   * @brief only get required sparse rows by default.
+   * @param fullSize: get full matrix parameter if *fullSize* set
+   * @param apply: get PARAMETER_APPLY on pserver if *apply* set
+   */
+  void getParametersRemote(bool fullSize = false, bool apply = false);
+
+  /**
+   * @brief restore the average parameter.
+   * @note It is only used in AverageOptimizer. Restore will get the current
+   * PARAMETER_VALUE back.
+   */
+  void restore();
+
+  /**
+   * @brief apply. Store the average parameter.
+   * @note It is only used in AverageOptimizer. Apply will store the current
+   * PARAMETER_VALUE to buffer, calculate the current Average Parameter, and
+   * save it to PARAMETER_VALUE.
+ */ + void apply(); + + /** + * @brief catchUpWith The Regularization will be delayed in many situations( + * pserver, local sparse). Catch Up means catch the regularization up, apply + * regularization to all params. + */ + void catchUpWith(); + + private: + ParameterUpdaterPrivate* m; +}; + +struct EvaluatorPrivate; +class Evaluator { + private: + Evaluator(); + DISABLE_COPY(Evaluator); + + public: + ~Evaluator(); + + /** + * @brief begin an evaluate stage. + */ + void start(); + + /** + * @brief end an evaluate stage. + */ + void finish(); + + /** + * @brief toString will get a evaluate result. + * + * __repr__ method in python + */ + std::string toString(); + + std::vector getNames() const; + + double getValue(const std::string name) const; + + private: + EvaluatorPrivate* m; + + friend class GradientMachine; +}; + +struct TrainerPrivate; +class Trainer { + private: + TrainerPrivate* m; + Trainer(); + Trainer(TrainerConfig* optConfig, GradientMachine* gm); + DISABLE_COPY(Trainer); + + public: + virtual ~Trainer(); + + /// Create A Trainer By TrainerConfig. using paddle command line. + static Trainer* createByCommandLine() throw(IOError); + + static Trainer* create(TrainerConfig* optConfig, + GradientMachine* gm) throw(IOError); + + /// Start training + void startTrain(); + + /// Finish training + void finishTrain(); + + /// Start a pass. + void startTrainPass(); + + /// Finish a pass + void finishTrainPass(); + + /** + * Train one batch, + * + * @return true if all batch finished. + */ + bool trainOneBatch(size_t batchSize); + + void trainOneDataBatch(size_t batchSize, const Arguments& args); + + void startTestPeriod(); + void testOneDataBatch(size_t batchSize, const Arguments& args); + void finishTestPeriod(); + + void forwardOneBatch(size_t batchSize); + + Arguments* getForwardOutput(); + + Arguments* getLayerOutput(const std::string& layerName) const; +}; + +/// the N-Best results generated from one input sequence. +class ISequenceResults { + public: + virtual ~ISequenceResults(); + + /// Number of result. + virtual size_t getSize() const = 0; + + /** + * Get sentence from dictionary. + * + * @param id the index of result. + * @param split if true, the return sentence will be splited with ' ' by + * each word. Default is false. + */ + virtual std::string getSentence(size_t id, bool split = false) const + throw(RangeError) = 0; + virtual std::vector getSequence(size_t id) const throw(RangeError) = 0; + virtual float getScore(size_t id) const throw(RangeError) = 0; +}; + +struct SequenceGeneratorPrivate; +class SequenceGenerator { + DISABLE_COPY(SequenceGenerator); + SequenceGenerator(); + + public: + virtual ~SequenceGenerator(); + + /** + * Generate Sequence by input. + * + * @note The inArgs is just one sequence of data. + * @note The return will get a N-best generate result by inArgs. + * Sort by score. 
+   */
+  ISequenceResults* generateSequence(const Arguments& inArgs) const;
+
+  void setDict(const std::vector<std::string>& dict);
+  void setBos(size_t bos);
+  void setEos(size_t eos);
+  void setMaxLength(size_t maxlength);
+  void setBeamSize(size_t beamSize);
+
+ private:
+  static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr);
+  friend class GradientMachine;
+
+ private:
+  SequenceGeneratorPrivate* m;
+};
diff --git a/paddle/legacy/api/PaddleAPIPrivate.h b/paddle/legacy/api/PaddleAPIPrivate.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e1c504d2e8338b749e2ffbb5af5f3a3ef132c81
--- /dev/null
+++ b/paddle/legacy/api/PaddleAPIPrivate.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include "PaddleAPI.h"
+#include "paddle/legacy/gserver/evaluators/Evaluator.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
+#include "paddle/trainer/TrainerConfigHelper.h"
+
+struct GradientMachinePrivate {
+  std::shared_ptr<paddle::GradientMachine> machine;
+
+  template <typename T>
+  inline T& cast(void* ptr) {
+    return *(T*)(ptr);
+  }
+};
+
+struct OptimizationConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> trainer_config;
+  paddle::OptimizationConfig config;
+
+  const paddle::OptimizationConfig& getConfig() {
+    if (trainer_config != nullptr) {
+      return trainer_config->getOptConfig();
+    } else {
+      return config;
+    }
+  }
+};
+
+struct TrainerConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> conf;
+  TrainerConfigPrivate() {}
+};
+
+struct ModelConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> conf;
+};
+
+struct ArgumentsPrivate {
+  std::vector<paddle::Argument> outputs;
+
+  inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
+    if (idx < outputs.size()) {
+      return outputs[idx];
+    } else {
+      RangeError e;
+      throw e;
+    }
+  }
+
+  template <typename T>
+  std::shared_ptr<T>& cast(void* rawPtr) const {
+    return *(std::shared_ptr<T>*)(rawPtr);
+  }
+};
+
+struct ParameterUpdaterPrivate {
+  std::unique_ptr<paddle::ParameterUpdater> updater;
+};
+
+struct ParameterPrivate {
+  std::shared_ptr<paddle::Parameter> sharedPtr;
+  paddle::Parameter* rawPtr;  // rawPtr is only used in ParameterUpdater;
+                              // in other situations sharedPtr should
+                              // contain the value.
+
+  ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {}
+
+  paddle::Parameter* getPtr() {
+    if (sharedPtr) {
+      return sharedPtr.get();
+    } else {
+      return rawPtr;
+    }
+  }
+};
+
+struct EvaluatorPrivate {
+  paddle::Evaluator* rawPtr;
+
+  EvaluatorPrivate() : rawPtr(nullptr) {}
+  ~EvaluatorPrivate() { delete rawPtr; }
+};
diff --git a/paddle/legacy/api/Parameter.cpp b/paddle/legacy/api/Parameter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f05740eb750cccd8cfb6cbc826a04585ec06822e
--- /dev/null
+++ b/paddle/legacy/api/Parameter.cpp
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/parameter/Parameter.h"
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
+
+Parameter::Parameter() : m(new ParameterPrivate()) {}
+
+Parameter::~Parameter() { delete m; }
+
+Parameter* Parameter::createFromRawPtr(void* ptr) {
+  auto p = new Parameter();
+  p->m->rawPtr = *static_cast<paddle::Parameter**>(ptr);
+  return p;
+}
+
+Parameter* Parameter::createFromSharedPtr(void* ptr) {
+  auto& p = *(paddle::ParameterPtr*)(ptr);
+  if (p == nullptr) {
+    return nullptr;
+  } else {
+    auto retParam = new Parameter();
+    retParam->m->sharedPtr = p;
+    return retParam;
+  }
+}
+
+std::string Parameter::getName() const { return m->getPtr()->getName(); }
+
+Vector* Parameter::getBuf(ParameterType type) {
+  auto buf = m->getPtr()->getBuf(type);
+  return Vector::createByPaddleVectorPtr(&buf);
+}
+
+ParameterConfig* Parameter::getConfig() {
+  if (m->sharedPtr) {
+    return ParameterConfig::createParameterConfigFromParameterSharedPtr(
+        &m->sharedPtr);
+  } else {
+    return ParameterConfig::createParameterConfigFromParameterPtr(m->rawPtr);
+  }
+}
+
+size_t Parameter::getID() const { return m->getPtr()->getID(); }
+
+void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
+
+bool Parameter::save(const std::string& filename) const {
+  return m->getPtr()->save(filename);
+}
+
+bool Parameter::load(const std::string& filename) const {
+  return m->getPtr()->load(filename);
+}
+
+size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
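+
+// All accessors above route through ParameterPrivate::getPtr(), which
+// prefers the shared_ptr and falls back to the raw pointer installed by
+// createFromRawPtr() on the ParameterUpdater path.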
diff --git a/paddle/legacy/api/ParameterOptimizer.cpp b/paddle/legacy/api/ParameterOptimizer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..477d9dae44362f9073639093c3c4d1cf0ac12044
--- /dev/null
+++ b/paddle/legacy/api/ParameterOptimizer.cpp
@@ -0,0 +1,124 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/parameter/ParameterOptimizer.h"
+#include <algorithm>
+#include "Internal.h"
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
+
+struct ParameterOptimizerPrivate {
+  std::unique_ptr<paddle::ParameterOptimizer> optimizer;
+};
+
+struct ParameterTraverseCallbackPrivate {
+  paddle::ParameterOptimizer::TraverseCallback callback;
+
+  ParameterTraverseCallbackPrivate() {}
+
+  ParameterTraverseCallbackPrivate(
+      const paddle::ParameterOptimizer::TraverseCallback& callback)
+      : callback(callback) {}
+
+  void apply(const std::vector<Vector*>& vecs,
+             const ParameterConfig& conf,
+             size_t sparseId) {
+    std::vector<paddle::VectorPtr> real_vecs;
+    real_vecs.resize(vecs.size());
+    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
+      if (v) {
+        return *(paddle::VectorPtr*)(v->getSharedPtr());
+      } else {
+        return paddle::VectorPtr();
+      }
+    });
+
+    paddle::ParameterConfig& real_conf =
+        *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
+                                        .getRawPtr());
+    callback(real_vecs.data(), real_conf, sparseId);
+  }
+};
+
+ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}
+
+ParameterOptimizer::~ParameterOptimizer() { delete m; }
+
+ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
+  CHECK(config != nullptr);
+  auto retOptimizer = new ParameterOptimizer();
+  retOptimizer->m->optimizer.reset(
+      paddle::ParameterOptimizer::create(config->m->getConfig(), false));
+  return retOptimizer;
+}
+
+void ParameterOptimizer::init(size_t numRows, const ParameterConfig* config) {
+  auto& conf = *(paddle::ParameterConfig*)(const_cast<ParameterConfig*>(config)
+                                               ->getRawPtr());
+  m->optimizer->init(numRows, &conf);
+}
+
+void ParameterOptimizer::startPass() { m->optimizer->startPass(); }
+
+void ParameterOptimizer::finishPass() { m->optimizer->finishPass(); }
+
+void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
+  constexpr size_t high_1 = 1UL << (sizeof(size_t) * 8 - 1);
+  CHECK_EQ(numSamplesProcessed & high_1, 0UL);  // Safely cast.
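+  // The CHECK_EQ above rejects any count with the top bit set, so the
+  // narrowing conversion to int64_t below cannot overflow.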
+  m->optimizer->startBatch((int64_t)numSamplesProcessed);
+}
+
+void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
+
+void ParameterOptimizer::update(const std::vector<Vector*>& vecs,
+                                const ParameterConfig& conf,
+                                size_t sparseId) {
+  ParameterTraverseCallbackPrivate invoker(
+      [&](const paddle::VectorPtr _vecs[],
+          const paddle::ParameterConfig& config,
+          size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
+  invoker.apply(vecs, conf, sparseId);
+}
+
+std::vector<int> ParameterOptimizer::getParameterTypes() const {
+  std::vector<int> returnValue;
+  staticCastVector(&returnValue, m->optimizer->getParameterTypes());
+  return returnValue;
+}
+
+ParameterTraverseCallback::ParameterTraverseCallback()
+    : m(new ParameterTraverseCallbackPrivate()) {}
+
+ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }
+
+void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
+                                      const ParameterConfig& conf,
+                                      size_t sparseId) {
+  m->apply(vecs, conf, sparseId);
+}
+
+ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
+    const ParameterConfig& config) const {
+  auto& param_config =
+      *(paddle::ParameterConfig*)const_cast<ParameterConfig&>(config)
+           .getRawPtr();
+  auto callback = m->optimizer->needSpecialTraversal(param_config);
+  if (callback) {
+    auto retCallback = new ParameterTraverseCallback();
+    retCallback->m->callback = callback;
+    return retCallback;
+  } else {
+    return nullptr;
+  }
+}
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/legacy/api/ParameterUpdater.cpp
similarity index 100%
rename from paddle/api/ParameterUpdater.cpp
rename to paddle/legacy/api/ParameterUpdater.cpp
diff --git a/paddle/legacy/api/SequenceGenerator.cpp b/paddle/legacy/api/SequenceGenerator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..96e075df50a2b238008ff482c8df9d31dab354d9
--- /dev/null
+++ b/paddle/legacy/api/SequenceGenerator.cpp
@@ -0,0 +1,242 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <sstream>
+#include "PaddleAPI.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/legacy/parameter/Argument.h"
+#include "paddle/utils/Flags.h"
+
+// used to represent partial sequence
+struct Path {
+  std::vector<int> ids;
+  float logProb;
+  paddle::MachineState machineState;
+
+  Path() { logProb = 0; }
+
+  Path(std::vector<int>& ids, float logProb, paddle::MachineState& machineState)
+      : ids(ids), logProb(logProb), machineState(machineState) {}
+
+  bool operator<(const Path& other) const { return (logProb > other.logProb); }
+};
+
+// Return top k (k == beam_size) optimal paths using beam search. The last
+// element of inArgs is the Argument of feedback. gradMachine has MaxIdLayer
+// as output and outArgs thus stores top k labels and their probabilities per
+// position.
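+//
+// Beam-search sketch: each iteration expands every live path with the top-k
+// labels from one forward step, moves paths that emit eos_id (or reach
+// max_length) to finalPaths, and keeps only the k best live paths. The loop
+// can stop early once finalPaths holds k entries whose worst logProb beats
+// the best live path, because log-probs only decrease as paths grow.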
+static void findNBest(paddle::GradientMachine* gradMachine,
+                      std::vector<paddle::Argument>& inArgs,
+                      std::vector<Path>& finalPaths,
+                      size_t bos_id,
+                      size_t eos_id,
+                      size_t max_length) {
+  std::vector<Path> paths;
+  Path emptyPath;
+  paths.push_back(emptyPath);
+  finalPaths.clear();
+  gradMachine->resetState();
+  paddle::Argument feedback = inArgs.back();
+  feedback.ids->setElement(0, (int)(bos_id));
+  float minFinalPathLogProb = 0;
+  size_t beam = 0;
+  int id;
+  std::vector<paddle::Argument> outArgs;
+  while (true) {  // iterate over each generated word
+    std::vector<Path> newPaths;
+    paddle::MachineState machineState;
+    for (size_t j = 0; j < paths.size(); j++) {
+      Path& path = paths[j];
+      if (path.machineState.size() > 0) {
+        gradMachine->setState(path.machineState);
+        feedback.ids->setElement(0, path.ids.back());
+      }
+      gradMachine->forward(inArgs, &outArgs, paddle::PASS_TEST);
+      gradMachine->getState(machineState);
+      beam = outArgs[0].ids->getSize();
+      for (size_t k = 0; k < beam; k++) {
+        id = outArgs[0].ids->getElement(k);
+        float prob = outArgs[0].in->getElement(0, k);
+        std::vector<int> nids(path.ids);
+        nids.push_back(id);
+        float newLogProb = path.logProb + log(prob);
+        Path newPath(nids, newLogProb, machineState);
+        if (id == (int)eos_id || nids.size() >= max_length) {
+          finalPaths.push_back(newPath);
+          if (minFinalPathLogProb > newPath.logProb) {
+            minFinalPathLogProb = newPath.logProb;
+          }
+        } else {
+          newPaths.push_back(newPath);
+        }
+      }
+    }
+
+    if (newPaths.size() == 0) {
+      break;
+    }
+    std::nth_element(newPaths.begin(),
+                     newPaths.begin() + std::min(beam, newPaths.size()),
+                     newPaths.end());
+    if (newPaths.size() > beam) {
+      newPaths.resize(beam);
+    }
+    // pathA < pathB means pathA.logProb > pathB.logProb
+    float maxPathLogProb =
+        std::min_element(newPaths.begin(), newPaths.end())->logProb;
+    if (finalPaths.size() >= beam && minFinalPathLogProb >= maxPathLogProb) {
+      break;
+    }
+    paths = newPaths;
+  }  // end while
+
+  std::partial_sort(finalPaths.begin(),
+                    finalPaths.begin() + std::min(beam, finalPaths.size()),
+                    finalPaths.end());
+  if (finalPaths.size() > beam) {
+    finalPaths.resize(beam);
+  }
+}
+
+struct SequenceGeneratorPrivate {
+  std::shared_ptr<paddle::GradientMachine> machine;
+  std::shared_ptr<std::vector<std::string>> dict;
+  size_t beginPos;
+  size_t endPos;
+  size_t maxLength;
+
+  paddle::Argument feedback;
+
+  template <typename T>
+  inline T& cast(void* ptr) {
+    return *(T*)(ptr);
+  }
+
+  inline void findNBest(std::vector<paddle::Argument>& inArgs,
+                        std::vector<Path>& path) {
+    ::findNBest(machine.get(), inArgs, path, beginPos, endPos, maxLength);
+  }
+
+  SequenceGeneratorPrivate()
+      : dict(std::make_shared<std::vector<std::string>>()),
+        beginPos(0UL),
+        endPos(0UL),
+        maxLength(0UL),
+        feedback(__create_feedback__()) {}
+
+ private:
+  static paddle::Argument __create_feedback__() {
+    paddle::Argument feedback;
+    feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu);
+
+    feedback.sequenceStartPositions =
+        paddle::ICpuGpuVector::create(/* size= */ 2, /* useGpu= */ false);
+    feedback.sequenceStartPositions->getMutableData(false)[0] = 0;
+    feedback.sequenceStartPositions->getMutableData(false)[1] = 1;
+    return feedback;
+  }
+};
+
+SequenceGenerator::SequenceGenerator() : m(new SequenceGeneratorPrivate()) {}
+
+SequenceGenerator::~SequenceGenerator() { delete m; }
+
+class PathSequenceResults : public ISequenceResults {
+  // ISequenceResults interface
+ public:
+  PathSequenceResults(const std::shared_ptr<std::vector<Path>>& path,
+                      const std::shared_ptr<std::vector<std::string>>& dict)
+      : path_(path), dict_(dict) {}
+
+
+struct SequenceGeneratorPrivate {
+  std::shared_ptr<paddle::GradientMachine> machine;
+  std::shared_ptr<std::vector<std::string>> dict;
+  size_t beginPos;
+  size_t endPos;
+  size_t maxLength;
+
+  paddle::Argument feedback;
+
+  template <typename T>
+  inline T& cast(void* ptr) {
+    return *(T*)(ptr);
+  }
+
+  inline void findNBest(std::vector<paddle::Argument>& inArgs,
+                        std::vector<Path>& path) {
+    ::findNBest(machine.get(), inArgs, path, beginPos, endPos, maxLength);
+  }
+
+  SequenceGeneratorPrivate()
+      : dict(std::make_shared<std::vector<std::string>>()),
+        beginPos(0UL),
+        endPos(0UL),
+        maxLength(0UL),
+        feedback(__create_feedback__()) {}
+
+ private:
+  static paddle::Argument __create_feedback__() {
+    paddle::Argument feedback;
+    feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu);
+
+    feedback.sequenceStartPositions =
+        paddle::ICpuGpuVector::create(/* size= */ 2, /* useGpu= */ false);
+    feedback.sequenceStartPositions->getMutableData(false)[0] = 0;
+    feedback.sequenceStartPositions->getMutableData(false)[1] = 1;
+    return feedback;
+  }
+};
+
+SequenceGenerator::SequenceGenerator() : m(new SequenceGeneratorPrivate()) {}
+
+SequenceGenerator::~SequenceGenerator() { delete m; }
+
+class PathSequenceResults : public ISequenceResults {
+  // ISequenceResults interface
+ public:
+  PathSequenceResults(const std::shared_ptr<std::vector<Path>>& path,
+                      const std::shared_ptr<std::vector<std::string>>& dict)
+      : path_(path), dict_(dict) {}
+
+  size_t getSize() const { return path_->size(); }
+  std::string getSentence(size_t id, bool split) const throw(RangeError) {
+    if (id < getSize()) {
+      Path& p = (*path_)[id];
+      std::ostringstream sout;
+      std::transform(p.ids.begin(),
+                     p.ids.end(),
+                     std::ostream_iterator<std::string>(sout, split ? " " : ""),
+                     [&](int id) { return (*dict_)[id]; });
+      return sout.str();
+    } else {
+      RangeError e;
+      throw e;
+    }
+  }
+  std::vector<int> getSequence(size_t id) const throw(RangeError) {
+    if (id < getSize()) {
+      Path& p = (*path_)[id];
+      return p.ids;
+    } else {
+      RangeError e;
+      throw e;
+    }
+  }
+  float getScore(size_t id) const throw(RangeError) {
+    if (id < getSize()) {
+      Path& p = (*path_)[id];
+      return p.logProb;
+    } else {
+      RangeError e;
+      throw e;
+    }
+  }
+
+ private:
+  std::shared_ptr<std::vector<Path>> path_;
+  std::shared_ptr<std::vector<std::string>> dict_;
+};
+
+ISequenceResults* SequenceGenerator::generateSequence(
+    const Arguments& inArgs) const {
+  auto& in_args =
+      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
+  for (auto& arg : in_args) {
+    arg.sequenceStartPositions = m->feedback.sequenceStartPositions;
+  }
+  in_args.push_back(m->feedback);
+  auto path = std::make_shared<std::vector<Path>>();
+  m->findNBest(in_args, *path);
+  return new PathSequenceResults(path, m->dict);
+}
+
+SequenceGenerator* SequenceGenerator::createByGradientMachineSharedPtr(
+    void* ptr) {
+  SequenceGenerator* r = new SequenceGenerator();
+  r->m->machine = r->m->cast<std::shared_ptr<paddle::GradientMachine>>(ptr);
+  return r;
+}
+
+void SequenceGenerator::setDict(const std::vector<std::string>& dict) {
+  *m->dict = dict;
+}
+
+void SequenceGenerator::setBos(size_t bos) { m->beginPos = bos; }
+
+void SequenceGenerator::setEos(size_t eos) { m->endPos = eos; }
+
+void SequenceGenerator::setMaxLength(size_t maxLength) {
+  m->maxLength = maxLength;
+}
+
+void SequenceGenerator::setBeamSize(size_t beamSize) {
+  if (beamSize != -1UL) {
+    FLAGS_beam_size = beamSize;
+  }
+}
+
+ISequenceResults::~ISequenceResults() {}
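Taken together, the generation API above is meant to be driven roughly as follows. A hedged sketch: `gm` (an already-built `std::shared_ptr<paddle::GradientMachine>`), `words` (the id-to-token table), and `in` (a populated `Arguments`) are illustrative names, and beam size 5 is arbitrary:

    SequenceGenerator* gen =
        SequenceGenerator::createByGradientMachineSharedPtr(&gm);
    gen->setDict(words);   // used by ISequenceResults::getSentence()
    gen->setBos(0);
    gen->setEos(1);
    gen->setMaxLength(100);
    gen->setBeamSize(5);
    ISequenceResults* res = gen->generateSequence(in);
    for (size_t i = 0; i < res->getSize(); ++i) {
      std::cout << res->getScore(i) << "\t" << res->getSentence(i, true) << "\n";
    }
    delete res;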
diff --git a/paddle/legacy/api/Trainer.cpp b/paddle/legacy/api/Trainer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6506acb738c0fff5f1637811330119c57a7ca03a
--- /dev/null
+++ b/paddle/legacy/api/Trainer.cpp
@@ -0,0 +1,175 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
+
+#include <stdlib.h>
+#include <atomic>
+#include <memory>
+
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/trainer/ParamUtil.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/trainer/TrainerInternal.h"
+#include "paddle/utils/Flags.h"
+
+using paddle::real;
+
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_int32(start_pass);
+
+struct TrainerPrivate : public paddle::Trainer {
+  bool _trainOneBatch(size_t batchSize);
+  bool forwardOneBatch(size_t batchSize);
+  void forwardOneDataBatch(const std::vector<paddle::Argument>& inArgs);
+  void setBatchSize(size_t batchSize);
+  std::vector<paddle::Argument>& getForwardOutput();
+
+  void startTestPeriod();
+  void finishTestPeriod();
+  void testOneDataBatch(const paddle::DataBatch& dataBatch);
+  TrainerPrivate() : paddle::Trainer() {}
+};
+
+Trainer::Trainer() : m(new TrainerPrivate()) {
+  auto conf = paddle::TrainerConfigHelper::createFromFlags();
+  if (conf != nullptr) {
+    m->init(conf);
+  }
+}
+
+Trainer::~Trainer() { delete m; }
+
+Trainer* Trainer::createByCommandLine() throw(IOError) {
+  auto retv = new Trainer();
+  if (retv->m->getConfig().IsInitialized()) {
+    return retv;
+  } else {
+    throw IOError();
+  }
+}
+
+Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
+    : m(new TrainerPrivate()) {
+  m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
+}
+
+Trainer* Trainer::create(TrainerConfig* config,
+                         GradientMachine* gm) throw(IOError) {
+  auto retv = new Trainer(config, gm);
+  if (retv->m->getConfig().IsInitialized()) {
+    return retv;
+  } else {
+    retv->m->getConfig().CheckInitialized();
+    throw IOError();
+  }
+}
+
+void Trainer::startTrain() { m->startTrain(); }
+
+void Trainer::finishTrain() { m->finishTrain(); }
+
+void Trainer::startTrainPass() { m->startTrainPass(); }
+
+void Trainer::finishTrainPass() { m->finishTrainPass(); }
+
+void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) {
+  paddle::DataBatch dataBatch;
+  dataBatch.getStreams() = inArgs.m->outputs;
+  dataBatch.setSize(batchSize);
+  m->trainOneDataBatch(dataBatch);
+}
+
+bool Trainer::trainOneBatch(size_t batchSize) {
+  return m->_trainOneBatch(batchSize);
+}
+
+bool TrainerPrivate::_trainOneBatch(size_t batchSize) {
+  paddle::DataBatch dataBatch;
+  CHECK(dataProvider_) << "data_provider is not specified";
+  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+  if (num == 0) {
+    return false;
+  }
+  trainOneDataBatch(dataBatch);
+  return true;
+}
+
+void TrainerPrivate::startTestPeriod() {
+  if (!tester_) {
+    createTester();
+  }
+  tester_->startTestPeriod();
+}
+
+void Trainer::startTestPeriod() { m->startTestPeriod(); }
+
+void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) {
+  tester_->testOneDataBatch(dataBatch, &forwardOutput_);
+}
+
+void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
+  paddle::DataBatch dataBatch;
+  dataBatch.getStreams() = args.m->outputs;
+  dataBatch.setSize(batchSize);
+  m->testOneDataBatch(dataBatch);
+}
+
+void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
+void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
+
+Arguments* Trainer::getLayerOutput(const std::string& layerName) const {
+  auto nn = this->m->getGradientMachine();
+  CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
+  auto arg = nn->getLayerOutput(layerName);
+  return Arguments::createByPaddleArgument(&arg);
+}
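// For orientation, this wrapper is typically driven in the following order
// (hedged sketch; numPasses and batchSize are illustrative values):
//
//   Trainer* t = Trainer::createByCommandLine();  // reads --config etc.
//   t->startTrain();
//   for (int pass = 0; pass < numPasses; ++pass) {
//     t->startTrainPass();
//     while (t->trainOneBatch(batchSize)) {
//       // _trainOneBatch() returns false once the DataProvider is exhausted
//     }
//     t->finishTrainPass();
//   }
//   t->finishTrain();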
+
+void Trainer::forwardOneBatch(size_t batchSize) {
+  m->forwardOneBatch(batchSize);
+}
+
+bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
+  CHECK(dataProvider_) << "data_provider is not specified";
+  paddle::DataBatch dataBatch;
+  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+  if (num == 0) {
+    return false;
+  }
+
+  forwardOneDataBatch(dataBatch.getStreams());
+  return true;
+}
+
+void TrainerPrivate::forwardOneDataBatch(
+    const std::vector<paddle::Argument>& inArgs) {
+  std::vector<paddle::Argument>& outArgs = forwardOutput_;
+
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    trainerInternal_.getGradientMachine()->prefetch(inArgs);
+    trainerInternal_.getParameterUpdater()->getParametersRemote();
+  }
+  trainerInternal_.getGradientMachine()->forward(
+      inArgs, &outArgs, paddle::PASS_TEST);
+}
+
+Arguments* Trainer::getForwardOutput() {
+  return Arguments::createByPaddleArgumentVector(&m->getForwardOutput());
+}
+
+std::vector<paddle::Argument>& TrainerPrivate::getForwardOutput() {
+  return forwardOutput_;
+}
diff --git a/paddle/legacy/api/Util.cpp b/paddle/legacy/api/Util.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d98daadbdecadd690ebf07db52372c0dd664af4a
--- /dev/null
+++ b/paddle/legacy/api/Util.cpp
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/utils/Common.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Util.h"
+
+#include <fenv.h>
+#include <algorithm>
+#include <vector>
+
+void initPaddle(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+  paddle::initPython(argc, argv);
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+}
+
+FloatArray::FloatArray(const float* b, const size_t l)
+    : buf(b), length(l), needFree(false) {}
+
+IntArray::IntArray(const int* b, const size_t l, bool f)
+    : buf(b), length(l), needFree(f) {}
+
+IntWithFloatArray::IntWithFloatArray(const float* v,
+                                     const int* i,
+                                     size_t l,
+                                     bool f)
+    : valBuf(v), idxBuf(i), length(l), needFree(f) {}
+
+bool isUsingGpu() { return FLAGS_use_gpu; }
+
+void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
+
+bool isGpuVersion() {
+#ifndef PADDLE_WITH_CUDA
+  return false;
+#else
+  return true;
+#endif
+}
+
+int getTrainerCount() { return FLAGS_trainer_count; }
+
+static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
+              "The Parameter Type should be same in core/api and core/common");
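The `needFree` flag in these small array structs records whether the receiver owns `buf`; the accessors that copy device memory to the host (`IVector::getData()` and `Vector::getData()` in Vector.cpp below) set it. A hedged disposal sketch, assuming an existing `Vector* vec`:

    FloatArray arr = vec->getData();   // GPU path copies into a new[] buffer
    // ... consume arr.buf[0 .. arr.length) ...
    if (arr.needFree) {
      delete[] arr.buf;  // only the copying path hands over ownership
    }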
diff --git a/paddle/legacy/api/Vector.cpp b/paddle/legacy/api/Vector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..73b6d3a15d6d0ddc80a17846604d9500d8f7e4e3
--- /dev/null
+++ b/paddle/legacy/api/Vector.cpp
@@ -0,0 +1,304 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+
+#include "paddle/legacy/math/Vector.h"
+
+#include <cstring>
+
+struct IVectorPrivate {
+  paddle::IVectorPtr vec;
+};
+
+IVector::IVector() : m(new IVectorPrivate()) {}
+
+IVector* IVector::createZero(size_t sz, bool useGpu) {
+  auto v = new IVector();
+  v->m->vec = paddle::IVector::create(sz, useGpu);
+  v->m->vec->zeroMem();
+  return v;
+}
+
+IVector* IVector::create(const std::vector<int>& data, bool useGpu) {
+  auto v = new IVector();
+  v->m->vec = paddle::IVector::create(data.size(), useGpu);
+  v->m->vec->copyFrom(data.data(), data.size());
+  return v;
+}
+
+IVector* IVector::createVectorFromNumpy(int* data,
+                                        int dim,
+                                        bool copy,
+                                        bool useGpu) throw(UnsupportError) {
+  if (useGpu) {
+    /// if use gpu only copy=true is supported
+    if (!copy) {
+      throw UnsupportError("Gpu mode only supports copy=True");
+    }
+    return IVector::createGpuVectorFromNumpy(data, dim);
+  } else {
+    return IVector::createCpuVectorFromNumpy(data, dim, copy);
+  }
+}
+
+IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) {
+  auto v = new IVector();
+  if (copy) {
+    v->m->vec = paddle::IVector::create(dim, false);
+    v->m->vec->copyFrom(data, dim);
+  } else {
+    v->m->vec = paddle::IVector::create(data, dim, false);
+  }
+  return v;
+}
+
+IVector* IVector::createGpuVectorFromNumpy(int* data, int dim) {
+  auto v = new IVector();
+  v->m->vec = paddle::IVector::create(dim, true);
+  v->m->vec->copyFrom(data, dim);
+  return v;
+}
+
+bool IVector::isGpu() const {
+  return dynamic_cast<paddle::GpuIVector*>(m->vec.get()) != nullptr;
+}
+
+IntArray IVector::getData() const {
+  if (this->isGpu()) {
+    int* src = m->vec->getData();
+    size_t len = m->vec->getSize();
+    int* dest = new int[len];
+    hl_memcpy_device2host(dest, src, len * sizeof(int));
+    return IntArray(dest, len, true);
+  } else {
+    return IntArray(m->vec->getData(), m->vec->getSize());
+  }
+}
+
+int& IVector::operator[](const size_t idx) throw(RangeError, UnsupportError) {
+  if (this->isGpu()) {
+    UnsupportError e;
+    throw e;
+  } else {
+    if (idx >= m->vec->getSize()) {
+      RangeError e;
+      throw e;
+    }
+  }
+  return m->vec->getData()[idx];
+}
+
+const int& IVector::operator[](const size_t idx) const
+    throw(RangeError, UnsupportError) {
+  return (*const_cast<IVector*>(this))[idx];
+}
+
+IVector* IVector::createByPaddleVectorPtr(void* ptr) {
+  auto* p = (paddle::IVectorPtr*)ptr;
+  if ((*p) != nullptr) {
+    IVector* vec = new IVector();
+    vec->m->vec = *p;
+    return vec;
+  } else {
+    return nullptr;
+  }
+}
+
+IVector::~IVector() { delete m; }
+
+void* IVector::getSharedPtr() const { return &m->vec; }
+
+size_t IVector::getSize() const { return m->vec->getSize(); }
+
+void IVector::toNumpyArrayInplace(int** data, int* dim1) throw(UnsupportError) {
+  auto v = std::dynamic_pointer_cast<paddle::CpuIVector>(m->vec);
+  if (v) {
+    *data = v->getData();
+    *dim1 = v->getSize();
+  } else {
+    throw UnsupportError();
+  }
+}
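// Note: toNumpyArrayInplace() above hands out the CPU vector's own buffer, so
// the numpy view it backs aliases the IVector; GPU-backed vectors throw
// UnsupportError. A hedged sketch, assuming an existing CPU-backed `ivec`:
//
//   int* view = nullptr;
//   int dim = 0;
//   ivec->toNumpyArrayInplace(&view, &dim);  // no copy: view aliases ivec
//   view[0] = 7;                             // also changes (*ivec)[0]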
+
+void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
+  *dim1 = m->vec->getSize();
+  *view_m_data = new int[*dim1];
+  if (auto cpuVec = dynamic_cast<paddle::CpuIVector*>(m->vec.get())) {
+    std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
+  } else if (auto gpuVec = dynamic_cast<paddle::GpuIVector*>(m->vec.get())) {
+    hl_memcpy_device2host(
+        *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
+  } else {
+    LOG(INFO) << "Unexpected situation";
+  }
+}
+
+void IVector::copyFromNumpyArray(int* data, int dim) {
+  m->vec->resize(dim);
+  m->vec->copyFrom(data, dim);
+}
+
+struct VectorPrivate {
+  paddle::VectorPtr vec;
+
+  void safeAccessData(const size_t idx,
+                      const std::function<void(float&)>& func) const
+      throw(RangeError, UnsupportError) {
+    auto cpuVec = std::dynamic_pointer_cast<paddle::CpuVector>(vec);
+    if (cpuVec != nullptr) {
+      if (idx < vec->getSize()) {
+        func(vec->getData()[idx]);
+      } else {
+        throw RangeError();
+      }
+    } else {
+      throw UnsupportError();
+    }
+  }
+};
+
+Vector::Vector() : m(new VectorPrivate()) {}
+
+Vector::~Vector() { delete m; }
+
+Vector* Vector::createZero(size_t sz, bool useGpu) {
+  auto retVec = new Vector();
+  retVec->m->vec = paddle::Vector::create(sz, useGpu);
+  retVec->m->vec->zero();
+  return retVec;
+}
+
+Vector* Vector::create(const std::vector<float>& data, bool useGpu) {
+  auto retVec = new Vector();
+  retVec->m->vec = paddle::Vector::create(data.size(), useGpu);
+  retVec->m->vec->copyFrom(data.data(), data.size());
+  return retVec;
+}
+
+Vector* Vector::createByPaddleVectorPtr(void* ptr) {
+  auto& v = *(paddle::VectorPtr*)(ptr);
+  if (v == nullptr) {
+    return nullptr;
+  } else {
+    auto retVec = new Vector();
+    retVec->m->vec = v;
+    return retVec;
+  }
+}
+
+Vector* Vector::createVectorFromNumpy(float* data,
+                                      int dim,
+                                      bool copy,
+                                      bool useGpu) throw(UnsupportError) {
+  if (useGpu) {
+    /// if use gpu only copy=True is supported
+    if (!copy) {
+      throw UnsupportError("Gpu mode only supports copy=True");
+    }
+    return Vector::createGpuVectorFromNumpy(data, dim);
+  } else {
+    return Vector::createCpuVectorFromNumpy(data, dim, copy);
+  }
+}
+
+Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) {
+  CHECK_GT(dim, 0);
+  auto retVec = new Vector();
+  if (copy) {
+    retVec->m->vec = paddle::Vector::create((size_t)dim, false);
+    retVec->m->vec->copyFrom(data, dim);
+  } else {
+    retVec->m->vec = paddle::Vector::create(data, (size_t)dim, false);
+  }
+  return retVec;
+}
+
+Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) {
+  CHECK_GT(dim, 0);
+  auto retVec = new Vector();
+  retVec->m->vec = paddle::Vector::create((size_t)dim, true);
+  retVec->m->vec->copyFrom(data, (size_t)dim);
+  return retVec;
+}
+
+void Vector::toNumpyArrayInplace(float** view_data,
+                                 int* dim1) throw(UnsupportError) {
+  auto v = std::dynamic_pointer_cast<paddle::CpuVector>(m->vec);
+  if (v != nullptr) {
+    *view_data = v->getData();
+    *dim1 = (int)v->getSize();
+  } else {
+    throw UnsupportError();
+  }
+}
+
+void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
+  *dim1 = m->vec->getSize();
+  *view_m_data = new float[*dim1];
+  if (auto cpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
+    std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
+  } else if (auto gpuVec = dynamic_cast<paddle::GpuVector*>(m->vec.get())) {
+    hl_memcpy_device2host(
+        *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
+  } else {
+    LOG(INFO) << "Unexpected situation";
+  }
+}
+
+void Vector::copyFromNumpyArray(float* data, int dim) {
+  m->vec->resize(dim);
+  m->vec->copyFrom(data, dim);
+}
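// Note: with copy=false, createCpuVectorFromNumpy() wraps the caller's buffer
// (paddle::Vector::create(data, dim, false)) rather than allocating, so the
// numpy array must outlive the Vector. Hedged sketch of the difference:
//
//   float data[4] = {0.f, 1.f, 2.f, 3.f};
//   Vector* alias = Vector::createCpuVectorFromNumpy(data, 4, /*copy=*/false);
//   Vector* owned = Vector::createCpuVectorFromNumpy(data, 4, /*copy=*/true);
//   data[0] = 42.f;  // alias->get(0) now reads 42.f; owned->get(0) still 0.f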
+
+FloatArray Vector::getData() const {
+  if (this->isGpu()) {
+    float* src = m->vec->getData();
+    size_t len = m->vec->getSize();
+    float* dest = new float[len];
+    hl_memcpy_device2host(dest, src, len * sizeof(float));
+    FloatArray ret_val(dest, len);
+    ret_val.needFree = true;
+    return ret_val;
+  } else {
+    FloatArray ret_val(m->vec->getData(), m->vec->getSize());
+    return ret_val;
+  }
+}
+
+void Vector::copyFrom(Vector* src) throw(RangeError) {
+  if (src->m->vec->getSize() != m->vec->getSize()) {
+    throw RangeError();
+  }
+  m->vec->copyFrom(*src->m->vec);
+}
+
+bool Vector::isGpu() const {
+  return std::dynamic_pointer_cast<paddle::GpuVector>(m->vec) != nullptr;
+}
+
+float Vector::get(const size_t idx) const throw(RangeError, UnsupportError) {
+  float r;
+  m->safeAccessData(idx, [&](float& o) { r = o; });
+  return r;
+}
+
+void Vector::set(const size_t idx, float val) throw(RangeError,
+                                                     UnsupportError) {
+  m->safeAccessData(idx, [&](float& o) { o = val; });
+}
+
+size_t Vector::getSize() const { return m->vec->getSize(); }
+
+void* Vector::getSharedPtr() { return &m->vec; }
diff --git a/paddle/api/__init__.py b/paddle/legacy/api/__init__.py similarity index 100% rename from paddle/api/__init__.py rename to paddle/legacy/api/__init__.py
diff --git a/paddle/api/numpy.i b/paddle/legacy/api/numpy.i similarity index 100% rename from paddle/api/numpy.i rename to paddle/legacy/api/numpy.i
diff --git a/paddle/api/test/.gitignore b/paddle/legacy/api/test/.gitignore similarity index 100% rename from paddle/api/test/.gitignore rename to paddle/legacy/api/test/.gitignore
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/legacy/api/test/CMakeLists.txt similarity index 100% rename from paddle/api/test/CMakeLists.txt rename to paddle/legacy/api/test/CMakeLists.txt
diff --git a/paddle/api/test/testArguments.py b/paddle/legacy/api/test/testArguments.py similarity index 100% rename from paddle/api/test/testArguments.py rename to paddle/legacy/api/test/testArguments.py
diff --git a/paddle/api/test/testGradientMachine.py b/paddle/legacy/api/test/testGradientMachine.py similarity index 100% rename from paddle/api/test/testGradientMachine.py rename to paddle/legacy/api/test/testGradientMachine.py
diff --git a/paddle/api/test/testMatrix.py b/paddle/legacy/api/test/testMatrix.py similarity index 100% rename from paddle/api/test/testMatrix.py rename to paddle/legacy/api/test/testMatrix.py
diff --git a/paddle/api/test/testTrain.py b/paddle/legacy/api/test/testTrain.py similarity index 100% rename from paddle/api/test/testTrain.py rename to paddle/legacy/api/test/testTrain.py
diff --git a/paddle/api/test/testTrainConfig.py b/paddle/legacy/api/test/testTrainConfig.py similarity index 100% rename from paddle/api/test/testTrainConfig.py rename to paddle/legacy/api/test/testTrainConfig.py
diff --git a/paddle/api/test/testTrainer.py b/paddle/legacy/api/test/testTrainer.py similarity index 100% rename from paddle/api/test/testTrainer.py rename to paddle/legacy/api/test/testTrainer.py
diff --git a/paddle/api/test/testVector.py b/paddle/legacy/api/test/testVector.py similarity index 100% rename from paddle/api/test/testVector.py rename to paddle/legacy/api/test/testVector.py
diff --git a/paddle/api/test/util.py b/paddle/legacy/api/test/util.py similarity index 100% rename from paddle/api/test/util.py rename to paddle/legacy/api/test/util.py
diff --git a/paddle/capi/Arguments.cpp b/paddle/legacy/capi/Arguments.cpp similarity index 100% rename from paddle/capi/Arguments.cpp rename to paddle/legacy/capi/Arguments.cpp
diff --git a/paddle/capi/CMakeLists.txt b/paddle/legacy/capi/CMakeLists.txt similarity index 100% rename from paddle/capi/CMakeLists.txt rename to paddle/legacy/capi/CMakeLists.txt
diff --git a/paddle/capi/Main.cpp b/paddle/legacy/capi/Main.cpp similarity index 100% rename from paddle/capi/Main.cpp rename to paddle/legacy/capi/Main.cpp
diff
--git a/paddle/capi/Matrix.cpp b/paddle/legacy/capi/Matrix.cpp similarity index 100% rename from paddle/capi/Matrix.cpp rename to paddle/legacy/capi/Matrix.cpp diff --git a/paddle/capi/Vector.cpp b/paddle/legacy/capi/Vector.cpp similarity index 100% rename from paddle/capi/Vector.cpp rename to paddle/legacy/capi/Vector.cpp diff --git a/paddle/capi/arguments.h b/paddle/legacy/capi/arguments.h similarity index 100% rename from paddle/capi/arguments.h rename to paddle/legacy/capi/arguments.h diff --git a/paddle/capi/capi.h b/paddle/legacy/capi/capi.h similarity index 100% rename from paddle/capi/capi.h rename to paddle/legacy/capi/capi.h diff --git a/paddle/legacy/capi/capi_private.h b/paddle/legacy/capi/capi_private.h new file mode 100644 index 0000000000000000000000000000000000000000..e5f8c8c5c8bd506f9c8f49ee7d03f9b20460efdb --- /dev/null +++ b/paddle/legacy/capi/capi_private.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "capi.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Argument.h" +#pragma once + +namespace paddle { +namespace capi { + +enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE }; + +#define STRUCT_HEADER CType type; + +struct CHeader { + STRUCT_HEADER +}; + +struct CIVector { + STRUCT_HEADER + IVectorPtr vec; + + CIVector() : type(kIVECTOR) {} +}; + +struct CMatrix { + STRUCT_HEADER + MatrixPtr mat; + + CMatrix() : type(kMATRIX) {} +}; + +struct CArguments { + STRUCT_HEADER + std::vector args; + + CArguments() : type(kARGUMENTS) {} + + template + paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) { + if (ID >= args.size()) return kPD_OUT_OF_RANGE; + switch (nestedLevel) { + case 0: + callback(args[ID].sequenceStartPositions); + break; + case 1: + callback(args[ID].subSequenceStartPositions); + break; + default: + return kPD_OUT_OF_RANGE; + } + return kPD_NO_ERROR; + } +}; + +struct CGradientMachine { + STRUCT_HEADER + paddle::GradientMachinePtr machine; + + CGradientMachine() : type(kGRADIENT_MACHINE) {} +}; + +template +inline T* cast(void* ptr) { + return reinterpret_cast(ptr); +} +} // namespace capi +} // namespace paddle diff --git a/paddle/capi/config.h.in b/paddle/legacy/capi/config.h.in similarity index 100% rename from paddle/capi/config.h.in rename to paddle/legacy/capi/config.h.in diff --git a/paddle/capi/error.cpp b/paddle/legacy/capi/error.cpp similarity index 100% rename from paddle/capi/error.cpp rename to paddle/legacy/capi/error.cpp diff --git a/paddle/capi/error.h b/paddle/legacy/capi/error.h similarity index 100% rename from paddle/capi/error.h rename to paddle/legacy/capi/error.h diff --git a/paddle/capi/examples/.gitignore b/paddle/legacy/capi/examples/.gitignore similarity index 100% rename from paddle/capi/examples/.gitignore rename to paddle/legacy/capi/examples/.gitignore diff --git 
a/paddle/capi/examples/README.md b/paddle/legacy/capi/examples/README.md similarity index 100% rename from paddle/capi/examples/README.md rename to paddle/legacy/capi/examples/README.md diff --git a/paddle/capi/examples/model_inference/README.md b/paddle/legacy/capi/examples/model_inference/README.md similarity index 100% rename from paddle/capi/examples/model_inference/README.md rename to paddle/legacy/capi/examples/model_inference/README.md diff --git a/paddle/capi/examples/model_inference/common/common.h b/paddle/legacy/capi/examples/model_inference/common/common.h similarity index 100% rename from paddle/capi/examples/model_inference/common/common.h rename to paddle/legacy/capi/examples/model_inference/common/common.h diff --git a/paddle/capi/examples/model_inference/dense/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt similarity index 100% rename from paddle/capi/examples/model_inference/dense/CMakeLists.txt rename to paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt diff --git a/paddle/capi/examples/model_inference/dense/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh similarity index 100% rename from paddle/capi/examples/model_inference/dense/convert_protobin.sh rename to paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/legacy/capi/examples/model_inference/dense/main.c similarity index 100% rename from paddle/capi/examples/model_inference/dense/main.c rename to paddle/legacy/capi/examples/model_inference/dense/main.c diff --git a/paddle/capi/examples/model_inference/dense/merge_v2_model.py b/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py similarity index 100% rename from paddle/capi/examples/model_inference/dense/merge_v2_model.py rename to paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py diff --git a/paddle/capi/examples/model_inference/dense/mnist_v2.py b/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py similarity index 100% rename from paddle/capi/examples/model_inference/dense/mnist_v2.py rename to paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py diff --git a/paddle/capi/examples/model_inference/dense/trainer_config.py b/paddle/legacy/capi/examples/model_inference/dense/trainer_config.py similarity index 100% rename from paddle/capi/examples/model_inference/dense/trainer_config.py rename to paddle/legacy/capi/examples/model_inference/dense/trainer_config.py diff --git a/paddle/capi/examples/model_inference/multi_thread/.gitignore b/paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/.gitignore rename to paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt rename to paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt diff --git a/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh rename to 
paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh diff --git a/paddle/capi/examples/model_inference/multi_thread/main.c b/paddle/legacy/capi/examples/model_inference/multi_thread/main.c similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/main.c rename to paddle/legacy/capi/examples/model_inference/multi_thread/main.c diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/main_gpu.c rename to paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c diff --git a/paddle/capi/examples/model_inference/multi_thread/trainer_config.py b/paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/trainer_config.py rename to paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py diff --git a/paddle/capi/examples/model_inference/sequence/.gitignore b/paddle/legacy/capi/examples/model_inference/sequence/.gitignore similarity index 100% rename from paddle/capi/examples/model_inference/sequence/.gitignore rename to paddle/legacy/capi/examples/model_inference/sequence/.gitignore diff --git a/paddle/capi/examples/model_inference/sequence/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt similarity index 100% rename from paddle/capi/examples/model_inference/sequence/CMakeLists.txt rename to paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt diff --git a/paddle/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh similarity index 100% rename from paddle/capi/examples/model_inference/sequence/convert_protobin.sh rename to paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh diff --git a/paddle/capi/examples/model_inference/sequence/main.c b/paddle/legacy/capi/examples/model_inference/sequence/main.c similarity index 100% rename from paddle/capi/examples/model_inference/sequence/main.c rename to paddle/legacy/capi/examples/model_inference/sequence/main.c diff --git a/paddle/capi/examples/model_inference/sequence/trainer_config.py b/paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py similarity index 100% rename from paddle/capi/examples/model_inference/sequence/trainer_config.py rename to paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py diff --git a/paddle/capi/examples/model_inference/sparse_binary/.gitignore b/paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore similarity index 100% rename from paddle/capi/examples/model_inference/sparse_binary/.gitignore rename to paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore diff --git a/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt similarity index 100% rename from paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt rename to paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt diff --git a/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh similarity index 100% rename from paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh rename to 
paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
diff --git a/paddle/capi/examples/model_inference/sparse_binary/main.c b/paddle/legacy/capi/examples/model_inference/sparse_binary/main.c similarity index 100% rename from paddle/capi/examples/model_inference/sparse_binary/main.c rename to paddle/legacy/capi/examples/model_inference/sparse_binary/main.c
diff --git a/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py b/paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py similarity index 100% rename from paddle/capi/examples/model_inference/sparse_binary/trainer_config.py rename to paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py
diff --git a/paddle/legacy/capi/gradient_machine.cpp b/paddle/legacy/capi/gradient_machine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0c5ddd856b5d374ae90d6c8ef898be52aa2e4e89
--- /dev/null
+++ b/paddle/legacy/capi/gradient_machine.cpp
@@ -0,0 +1,180 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gradient_machine.h"
+#include "capi_private.h"
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+
+#define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v)
+
+enum GradientMatchineCreateMode {
+  CREATE_MODE_NORMAL = 0,
+  CREATE_MODE_TESTING = 4
+};
+
+namespace paddle {
+
+class MyNeuralNetwork : public NeuralNetwork {
+ public:
+  MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
+      : NeuralNetwork(name, network) {}
+};
+
+NeuralNetwork* newCustomNerualNetwork(const std::string& name,
+                                      NeuralNetwork* network) {
+  return new MyNeuralNetwork(name, network);
+}
+}  // namespace paddle
+
+extern "C" {
+paddle_error paddle_gradient_machine_create_for_inference(
+    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) {
+  if (modelConfigProtobuf == nullptr) return kPD_NULLPTR;
+  paddle::ModelConfig config;
+  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
+      !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+
+  auto ptr = new paddle::capi::CGradientMachine();
+  ptr->machine.reset(paddle::GradientMachine::create(
+      config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
+  *machine = ptr;
+  return kPD_NO_ERROR;
+}
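// The function below consumes a "merged model" blob whose layout is implied
// by the reads that follow:
//
//   [ int64_t modelConfigSize ]
//   [ modelConfigSize bytes: TrainerConfig (or bare ModelConfig) protobuf ]
//   [ parameter blobs, consumed in order by Parameter::load() ]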
+
+paddle_error paddle_gradient_machine_create_for_inference_with_parameters(
+    paddle_gradient_machine* machine, void* mergedModel, uint64_t size) {
+  if (mergedModel == nullptr) return kPD_NULLPTR;
+  std::istringstream is(
+      std::string(static_cast<const char*>(mergedModel), size));
+  int64_t modelConfigSize = 0;
+  is.read((char*)(&modelConfigSize), sizeof(modelConfigSize));
+  std::string modelConfigProtobuf;
+  modelConfigProtobuf.resize(modelConfigSize);
+  is.read(&modelConfigProtobuf[0], modelConfigSize);
+  paddle::TrainerConfig config;
+  paddle::ModelConfig modelConfig;
+  if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) {
+    if (!modelConfig.ParseFromString(modelConfigProtobuf) ||
+        !modelConfig.IsInitialized()) {
+      return kPD_PROTOBUF_ERROR;
+    }
+  } else {
+    modelConfig = config.model_config();
+  }
+  auto ptr = new paddle::capi::CGradientMachine();
+  ptr->machine.reset(paddle::GradientMachine::create(
+      modelConfig, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
+  std::vector<paddle::ParameterPtr>& parameters = ptr->machine->getParameters();
+  for (auto& para : parameters) {
+    para->load(is);
+  }
+
+  *machine = ptr;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
+  delete cast(machine);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_load_parameter_from_disk(
+    paddle_gradient_machine machine, const char* path) {
+  auto m = cast(machine);
+  if (m == nullptr || path == nullptr || m->machine == nullptr)
+    return kPD_NULLPTR;
+  m->machine->loadParameters(path);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine,
+                                             paddle_arguments inArgs,
+                                             paddle_arguments outArgs,
+                                             bool isTrain) {
+  auto m = cast(machine);
+  auto in = paddle::capi::cast<paddle::capi::CArguments>(inArgs);
+  auto out = paddle::capi::cast<paddle::capi::CArguments>(outArgs);
+  if (m == nullptr || in == nullptr || out == nullptr || m->machine == nullptr)
+    return kPD_NULLPTR;
+  m->machine->forward(
+      in->args, &out->args, isTrain ? paddle::PASS_TRAIN : paddle::PASS_TEST);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_create_shared_param(
+    paddle_gradient_machine origin,
+    void* modelConfigProtobuf,
+    int size,
+    paddle_gradient_machine* slave) {
+  auto o = cast(origin);
+  if (origin == nullptr || slave == nullptr || o->machine == nullptr) {
+    return kPD_NULLPTR;
+  }
+  paddle::ModelConfig config;
+  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
+      !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+
+  std::unique_ptr<paddle::capi::CGradientMachine> ptr(
+      new paddle::capi::CGradientMachine());
+  auto nn = paddle::NeuralNetwork::create(config);
+  nn->init(config,
+           [&o](int paramId, paddle::Parameter* param) {
+             auto p = o->machine->getParameters()[paramId];
+             param->enableSharedType(paddle::PARAMETER_VALUE,
+                                     p->getBuf(paddle::PARAMETER_VALUE));
+           },
+           {paddle::PARAMETER_VALUE},
+           false);
+  ptr->machine.reset(nn);
+  *slave = ptr.release();
+  return kPD_NO_ERROR;
+}
+}
+
+paddle_error paddle_gradient_machine_randomize_param(
+    paddle_gradient_machine machine) {
+  auto m = cast(machine);
+  if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR;
+  m->machine->randParameters();
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_get_layer_output(
+    paddle_gradient_machine machine,
+    const char* layerName,
+    paddle_arguments args) {
+  auto m = cast(machine);
+  auto out = paddle::capi::cast<paddle::capi::CArguments>(args);
+  if (m == nullptr || layerName == nullptr || out == nullptr ||
+      m->machine == nullptr) {
+    return kPD_NULLPTR;
+  }
+
+  auto layerOutput = m->machine->getLayerOutput(layerName);
+  out->args.push_back(layerOutput);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_release_layer_output(
+    paddle_gradient_machine machine) {
+  auto m = cast(machine);
+  if (m == nullptr || m->machine == nullptr) {
+    return kPD_NULLPTR;
+  }
+  m->machine->releaseOutput();
+  return kPD_NO_ERROR;
+}
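End-to-end, the C API defined above is driven roughly as follows (hedged sketch: `buf`/`len` hold a merged model, the matrix filling is elided, and every returned error code should be checked):

    paddle_gradient_machine machine;
    paddle_gradient_machine_create_for_inference_with_parameters(&machine, buf, len);
    paddle_arguments in = paddle_arguments_create_none();
    paddle_arguments out = paddle_arguments_create_none();
    paddle_arguments_resize(in, 1);
    // create a paddle_matrix, fill a row, then paddle_arguments_set_value(in, 0, mat)
    paddle_gradient_machine_forward(machine, in, out, /*isTrain=*/false);
    // read results back via paddle_arguments_get_value(out, 0, mat)
    paddle_arguments_destroy(out);
    paddle_arguments_destroy(in);
    paddle_gradient_machine_destroy(machine);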
diff --git a/paddle/capi/gradient_machine.h b/paddle/legacy/capi/gradient_machine.h similarity index 100% rename from paddle/capi/gradient_machine.h rename to paddle/legacy/capi/gradient_machine.h
diff --git a/paddle/capi/main.h b/paddle/legacy/capi/main.h similarity index 100% rename from paddle/capi/main.h rename to paddle/legacy/capi/main.h
diff --git a/paddle/capi/matrix.h b/paddle/legacy/capi/matrix.h similarity index 100% rename from paddle/capi/matrix.h rename to paddle/legacy/capi/matrix.h
diff --git a/paddle/capi/paddle_capi.map b/paddle/legacy/capi/paddle_capi.map similarity index 100% rename from paddle/capi/paddle_capi.map rename to paddle/legacy/capi/paddle_capi.map
diff --git a/paddle/capi/tests/.gitignore b/paddle/legacy/capi/tests/.gitignore similarity index 100% rename from paddle/capi/tests/.gitignore rename to paddle/legacy/capi/tests/.gitignore
diff --git a/paddle/capi/tests/CMakeLists.txt b/paddle/legacy/capi/tests/CMakeLists.txt similarity index 100% rename from paddle/capi/tests/CMakeLists.txt rename to paddle/legacy/capi/tests/CMakeLists.txt
diff --git a/paddle/capi/tests/test_Arguments.cpp b/paddle/legacy/capi/tests/test_Arguments.cpp similarity index 100% rename from paddle/capi/tests/test_Arguments.cpp rename to paddle/legacy/capi/tests/test_Arguments.cpp
diff --git a/paddle/legacy/capi/tests/test_GradientMachine.cpp b/paddle/legacy/capi/tests/test_GradientMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2c02669ccfa7550971e89c0dfad73e19368da527
--- /dev/null
+++ b/paddle/legacy/capi/tests/test_GradientMachine.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/gserver/gradientmachines/GradientMachine.h>
+#include <paddle/trainer/TrainerConfigHelper.h>
+#include <stdlib.h>
+#include <string.h>
+#include <type_traits>
+#include "capi.h"
+#include "paddle/utils/ThreadLocal.h"
+
+static std::vector<paddle_real> randomBuffer(size_t bufSize) {
+  auto& eng = paddle::ThreadLocalRandomEngine::get();
+  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
+  std::vector<paddle_real> retv;
+  retv.reserve(bufSize);
+  for (size_t i = 0; i < bufSize; ++i) {
+    retv.push_back(dist(eng));
+  }
+  return retv;
+}
+
+TEST(GradientMachine, testPredict) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle::TrainerConfigHelper config("./test_predict_network.py");
+  std::string buffer;
+  ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer));
+  paddle_gradient_machine machine;
+
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_create_for_inference(
+                &machine, &buffer[0], (int)buffer.size()));
+  std::unique_ptr<paddle::GradientMachine> gm(
+      paddle::GradientMachine::create(config.getModelConfig()));
+  ASSERT_NE(nullptr, gm);
+  gm->randParameters();
+  gm->saveParameters("./");
+
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_load_parameter_from_disk(machine, "./"));
+
+  paddle_gradient_machine machineSlave;
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_create_shared_param(
+                machine, &buffer[0], (int)buffer.size(), &machineSlave));
+  std::swap(machineSlave, machine);
+  paddle_arguments outArgs = paddle_arguments_create_none();
+
+  paddle_arguments inArgs = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1));
+  paddle_matrix mat = paddle_matrix_create(1, 100, false);
+  static_assert(std::is_same<paddle_real, float>::value, "");
+
+  auto data = randomBuffer(100);
+  paddle_real* rowPtr;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
+  memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real));
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat));
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_forward(machine, inArgs, outArgs, false));
+
+  uint64_t sz;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz));
+  ASSERT_EQ(1UL, sz);
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat));
+  std::vector<paddle::Argument> paddleInArgs;
+  std::vector<paddle::Argument> paddleOutArgs;
+  paddleInArgs.resize(1);
+  paddleInArgs[0].value =
+      paddle::Matrix::create(data.data(), 1, 100, false, false);
+
+  gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST);
+
+  auto matPaddle = paddleOutArgs[0].value;
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(matPaddle->getHeight(), height);
+  ASSERT_EQ(matPaddle->getWidth(), width);
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
+  for (size_t i = 0; i < width; ++i) {
+    ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5);
+  }
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs));
+  std::swap(machineSlave, machine);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine));
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  std::vector<char*> argvs;
+  argvs.push_back(strdup("--use_gpu=false"));
+  paddle_init((int)argvs.size(), argvs.data());
+  for (auto each : argvs) {
+    free(each);
+  }
+  return RUN_ALL_TESTS();
+}
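The `std::swap(machineSlave, machine)` dance in this test exercises the shared-parameter path: the slave machine created by `paddle_gradient_machine_create_shared_param` aliases the master's PARAMETER_VALUE buffers, so the forward pass runs on weights that were loaded only once. A hedged per-thread sketch of the same pattern (as used by the multi_thread example; `master`, `conf`, and `confSize` are illustrative):

    paddle_gradient_machine slave;
    paddle_gradient_machine_create_shared_param(master, conf, confSize, &slave);
    // each worker thread forwards on its own slave; weights are shared, not copied
    paddle_gradient_machine_forward(slave, in, out, /*isTrain=*/false);
    paddle_gradient_machine_destroy(slave);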
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/legacy/capi/tests/test_Matrix.cpp similarity index 100% rename from paddle/capi/tests/test_Matrix.cpp rename to paddle/legacy/capi/tests/test_Matrix.cpp
diff --git a/paddle/capi/tests/test_Vector.cpp b/paddle/legacy/capi/tests/test_Vector.cpp similarity index 100% rename from paddle/capi/tests/test_Vector.cpp rename to paddle/legacy/capi/tests/test_Vector.cpp
diff --git a/paddle/capi/tests/test_predict_network.py b/paddle/legacy/capi/tests/test_predict_network.py similarity index 100% rename from paddle/capi/tests/test_predict_network.py rename to paddle/legacy/capi/tests/test_predict_network.py
diff --git a/paddle/capi/vector.h b/paddle/legacy/capi/vector.h similarity index 100% rename from paddle/capi/vector.h rename to paddle/legacy/capi/vector.h
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/legacy/cuda/CMakeLists.txt similarity index 100% rename from paddle/cuda/CMakeLists.txt rename to paddle/legacy/cuda/CMakeLists.txt
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/legacy/cuda/include/hl_activation_functions.h similarity index 100% rename from paddle/cuda/include/hl_activation_functions.h rename to paddle/legacy/cuda/include/hl_activation_functions.h
diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/legacy/cuda/include/hl_aggregate.h similarity index 100% rename from paddle/cuda/include/hl_aggregate.h rename to paddle/legacy/cuda/include/hl_aggregate.h
diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/legacy/cuda/include/hl_avx_functions.h similarity index 100% rename from paddle/cuda/include/hl_avx_functions.h rename to paddle/legacy/cuda/include/hl_avx_functions.h
diff --git a/paddle/legacy/cuda/include/hl_base.h b/paddle/legacy/cuda/include/hl_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..8451d2546d47141f3bc8505d11ce19287286747f
--- /dev/null
+++ b/paddle/legacy/cuda/include/hl_base.h
@@ -0,0 +1,250 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+
+#ifdef PADDLE_TYPE_DOUBLE
+#define HL_FLOAT_MAX 1.7976931348623157e+308
+#define HL_FLOAT_MIN 2.2250738585072014e-308
+using real = double;
+#else
+#define HL_FLOAT_MAX 3.40282347e+38F
+#define HL_FLOAT_MIN 1.17549435e-38F
+using real = float;
+#endif
+
+/**
+ * The maximum input value for exp, used to avoid overflow problem.
+ * currently only used for tanh function.
+ */
+#define EXP_MAX_INPUT 40.0
+
+/**
+ * @brief DIVUP(x, y) is similar to ceil(x / y).
+ * @note For CUDA, DIVUP will be used to specify
+ *       the size of blockDim.
+ */
+#ifndef DIVUP
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
+#endif
+
+/**
+ * HPPL is an internal high performance parallel computing library
+ * for high-level neural network routines, which can support many
+ * heterogeneous compute architectures, such as GPU, FPGA, etc.
+ */
+
+/**
+ * @brief HPPL CUDA Stream.
+ *
+ * @note Each thread can use HPPL_STREAM_* after calling hl_init.
+ *       HPPL_STREAM_DEFAULT is HPPL default stream.
+ */
+typedef enum {
+  HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
+  HPPL_STREAM_1 = 1,
+  HPPL_STREAM_2 = 2,
+  HPPL_STREAM_3 = 3,
+  HPPL_STREAM_4 = 4,
+  HPPL_THREAD_STREAM_1 = 5,
+  HPPL_THREAD_STREAM_2 = 6,
+  HPPL_THREAD_STREAM_3 = 7,
+  HPPL_THREAD_STREAM_4 = 8,
+  HPPL_STREAM_END
+} hl_stream_t;
+
+/**
+ * @brief HPPL activation mode.
+ */
+typedef enum {
+  HL_ACTIVATION_SIGMOID = 0,
+  HL_ACTIVATION_RELU = 1,
+  HL_ACTIVATION_TANH = 2,
+  HL_ACTIVATION_LINEAR = 3,
+  HL_ACTIVATION_END
+} hl_activation_mode_t;
+
+/**
+ * @brief Transpose type.
+ */
+typedef enum {
+  HPPL_OP_N = 0, /* non transpose */
+  HPPL_OP_T = 1, /* transpose */
+  HPPL_OP_END
+} hl_trans_op_t;
+
+/**
+ * @brief Lstm value.
+ *
+ * @param gateValue input value.
+ * @param prevStateValue previous state value.
+ * @param stateValue state value.
+ * @param stateActiveValue state active value.
+ * @param outputValue output value.
+ */
+typedef struct {
+  real *gateValue;
+  real *prevStateValue;
+  real *stateValue;
+  real *stateActiveValue;
+  real *outputValue;
+  real *checkIg;
+  real *checkFg;
+  real *checkOg;
+} hl_lstm_value;
+
+/**
+ * @brief Lstm gradient.
+ *
+ * @param gateGrad input gradient.
+ * @param prevStateGrad previous state gradient.
+ * @param stateGrad state gradient.
+ * @param stateActiveGrad state active gradient.
+ * @param outputGrad output gradient.
+ */
+typedef struct {
+  real *gateGrad;
+  real *prevStateGrad;
+  real *stateGrad;
+  real *stateActiveGrad;
+  real *outputGrad;
+  real *checkIgGrad;
+  real *checkFgGrad;
+  real *checkOgGrad;
+} hl_lstm_grad;
+
+/**
+ * @brief Gru value.
+ *
+ * @param gateWeight gate weight (updateGate + resetGate).
+ * @param stateWeight frame state weight.
+ * @param gateValue gate value results.
+ * @param resetOutputValue resetOutput value.
+ * @param outputValue output value.
+ * @param prevOutValue previous output value.
+ *
+ */
+typedef struct {
+  real *gateWeight;
+  real *stateWeight;
+  real *gateValue;
+  real *resetOutputValue;
+  real *outputValue;
+  real *prevOutValue;
+} hl_gru_value;
+
+/**
+ * @brief Gru gradient.
+ *
+ * @param gateWeightGrad gate weight gradient.
+ * @param stateWeightGrad frame state weight gradient.
+ * @param gateGrad gate gradient results.
+ * @param resetOutputGrad resetOutput gradient.
+ * @param outputGrad output gradient.
+ * @param prevOutGrad previous output gradient.
+ */
+typedef struct {
+  real *gateWeightGrad;
+  real *stateWeightGrad;
+  real *gateGrad;
+  real *resetOutputGrad;
+  real *outputGrad;
+  real *prevOutGrad;
+} hl_gru_grad;
+
+/**
+ * @brief Sparse matrix value type.
+ */
+typedef enum {
+  HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
+  HL_FLOAT_VALUE = 1,
+  HL_VALUE_END
+} hl_matrix_value_t;
+
+/**
+ * @brief HPPL matrix format.
+ */
+typedef enum {
+  HL_SPARSE_CSR = 0,
+  HL_SPARSE_CSC = 1,
+  HL_SPARSE_END
+} hl_matrix_format_t;
+
+typedef struct _hl_matrix_s *hl_matrix_s;
+
+/**
+ * @brief HPPL sparse matrix.
+ *
+ * @param matrix sparse matrix.
+ * @param format matrix format.
+ * @param type the type of matrix values.
+ * @param rows matrix rows.
+ * @param cols matrix columns.
+ * @param nnz nonzero values of sparse matrix.
+ */
+typedef struct {
+  hl_matrix_s matrix;
+  hl_matrix_format_t format;
+  hl_matrix_value_t type;
+  int rows;
+  int cols;
+  size_t nnz;
+} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
+
+#ifdef __NVCC__
+
+#include <cuda_runtime.h>
+
+#include "paddle/legacy/cuda/include/hl_cuda.h"
+#include "paddle/utils/Logging.h"
+
+extern __thread bool g_sync_flag;
+extern __thread cudaStream_t default_stream;
+#define STREAM_DEFAULT default_stream
+
+/**
+ * @brief Check cuda kernel execution.
+ * @param msg error string
+ */
+#define CHECK_SYNC(msg)                                                 \
+  if (true == g_sync_flag) {                                            \
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);                         \
+    cudaError_t err = (cudaError_t)hl_get_device_last_error();          \
+    CHECK_EQ(cudaSuccess, err)                                          \
+        << "[" << msg << "] "                                           \
+        << "CUDA error: " << hl_get_device_error_string((size_t)err);   \
+  }
+
+// __shfl has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
+  return __shfl_down(val, delta);
+}
+
+template <typename T>
+__forceinline__ __device__ T
+__shfl_sync(unsigned, T val, int src_line, int width) {
+  return __shfl(val, src_line, width);
+}
+
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+#endif  // __NVCC__
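These shims let warp-level reductions be written against the CUDA 9 `*_sync` API while still compiling with older toolkits. A hedged sketch of the intended call pattern (illustrative kernel fragment, not from the original sources; `val` and `n` are assumptions):

    // Inside a __global__ kernel: sum `val` across the active lanes of a warp.
    unsigned mask;
    CREATE_SHFL_MASK(mask, threadIdx.x < n);  // n = number of live lanes
    for (int offset = 16; offset > 0; offset >>= 1) {
      val += __shfl_down_sync(mask, val, offset);
    }
    // lane 0 of the warp now holds the warp-wide sum in `val`.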
diff --git a/paddle/cuda/include/hl_batch_norm.h b/paddle/legacy/cuda/include/hl_batch_norm.h similarity index 100% rename from paddle/cuda/include/hl_batch_norm.h rename to paddle/legacy/cuda/include/hl_batch_norm.h
diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/legacy/cuda/include/hl_batch_transpose.h similarity index 100% rename from paddle/cuda/include/hl_batch_transpose.h rename to paddle/legacy/cuda/include/hl_batch_transpose.h
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/legacy/cuda/include/hl_cnn.h similarity index 100% rename from paddle/cuda/include/hl_cnn.h rename to paddle/legacy/cuda/include/hl_cnn.h
diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/legacy/cuda/include/hl_cpu_gru.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_gru.cuh rename to paddle/legacy/cuda/include/hl_cpu_gru.cuh
diff --git a/paddle/cuda/include/hl_cpu_lstm.cuh b/paddle/legacy/cuda/include/hl_cpu_lstm.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_lstm.cuh rename to paddle/legacy/cuda/include/hl_cpu_lstm.cuh
diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_matrix_kernel.cuh rename to paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh
diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh b/paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh rename to paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh
diff --git a/paddle/cuda/include/hl_cpu_scalar.cuh b/paddle/legacy/cuda/include/hl_cpu_scalar.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_scalar.cuh rename to paddle/legacy/cuda/include/hl_cpu_scalar.cuh
diff --git a/paddle/cuda/include/hl_cpu_simd_neon.cuh b/paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_simd_neon.cuh rename to paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh
diff --git a/paddle/cuda/include/hl_cpu_simd_sse.cuh b/paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_simd_sse.cuh rename to paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh
diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/legacy/cuda/include/hl_cuda.h similarity index 100% rename from paddle/cuda/include/hl_cuda.h rename to paddle/legacy/cuda/include/hl_cuda.h
diff --git a/paddle/cuda/include/hl_cuda.ph b/paddle/legacy/cuda/include/hl_cuda.ph similarity index 100% rename from paddle/cuda/include/hl_cuda.ph rename to paddle/legacy/cuda/include/hl_cuda.ph
diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/legacy/cuda/include/hl_cuda_cublas.h similarity index 100% rename from paddle/cuda/include/hl_cuda_cublas.h rename to paddle/legacy/cuda/include/hl_cuda_cublas.h
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/legacy/cuda/include/hl_cuda_cudnn.h similarity index 100% rename
from paddle/cuda/include/hl_cuda_cudnn.h rename to paddle/legacy/cuda/include/hl_cuda_cudnn.h diff --git a/paddle/cuda/include/hl_cuda_cudnn.ph b/paddle/legacy/cuda/include/hl_cuda_cudnn.ph similarity index 100% rename from paddle/cuda/include/hl_cuda_cudnn.ph rename to paddle/legacy/cuda/include/hl_cuda_cudnn.ph diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/legacy/cuda/include/hl_device_functions.cuh similarity index 100% rename from paddle/cuda/include/hl_device_functions.cuh rename to paddle/legacy/cuda/include/hl_device_functions.cuh diff --git a/paddle/cuda/include/hl_functions.h b/paddle/legacy/cuda/include/hl_functions.h similarity index 100% rename from paddle/cuda/include/hl_functions.h rename to paddle/legacy/cuda/include/hl_functions.h diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/legacy/cuda/include/hl_gpu.h similarity index 100% rename from paddle/cuda/include/hl_gpu.h rename to paddle/legacy/cuda/include/hl_gpu.h diff --git a/paddle/cuda/include/hl_gpu_functions.cuh b/paddle/legacy/cuda/include/hl_gpu_functions.cuh similarity index 100% rename from paddle/cuda/include/hl_gpu_functions.cuh rename to paddle/legacy/cuda/include/hl_gpu_functions.cuh diff --git a/paddle/cuda/include/hl_gpu_gru.cuh b/paddle/legacy/cuda/include/hl_gpu_gru.cuh similarity index 100% rename from paddle/cuda/include/hl_gpu_gru.cuh rename to paddle/legacy/cuda/include/hl_gpu_gru.cuh diff --git a/paddle/cuda/include/hl_gpu_lstm.cuh b/paddle/legacy/cuda/include/hl_gpu_lstm.cuh similarity index 100% rename from paddle/cuda/include/hl_gpu_lstm.cuh rename to paddle/legacy/cuda/include/hl_gpu_lstm.cuh diff --git a/paddle/cuda/include/hl_gpu_matrix_kernel.cuh b/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh similarity index 100% rename from paddle/cuda/include/hl_gpu_matrix_kernel.cuh rename to paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh diff --git a/paddle/cuda/include/hl_gru_ops.cuh b/paddle/legacy/cuda/include/hl_gru_ops.cuh similarity index 100% rename from paddle/cuda/include/hl_gru_ops.cuh rename to paddle/legacy/cuda/include/hl_gru_ops.cuh diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/legacy/cuda/include/hl_lstm.h similarity index 100% rename from paddle/cuda/include/hl_lstm.h rename to paddle/legacy/cuda/include/hl_lstm.h diff --git a/paddle/cuda/include/hl_lstm_ops.cuh b/paddle/legacy/cuda/include/hl_lstm_ops.cuh similarity index 100% rename from paddle/cuda/include/hl_lstm_ops.cuh rename to paddle/legacy/cuda/include/hl_lstm_ops.cuh diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/legacy/cuda/include/hl_matrix.h similarity index 100% rename from paddle/cuda/include/hl_matrix.h rename to paddle/legacy/cuda/include/hl_matrix.h diff --git a/paddle/cuda/include/hl_matrix_apply.cuh b/paddle/legacy/cuda/include/hl_matrix_apply.cuh similarity index 100% rename from paddle/cuda/include/hl_matrix_apply.cuh rename to paddle/legacy/cuda/include/hl_matrix_apply.cuh diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/legacy/cuda/include/hl_matrix_base.cuh similarity index 100% rename from paddle/cuda/include/hl_matrix_base.cuh rename to paddle/legacy/cuda/include/hl_matrix_base.cuh diff --git a/paddle/cuda/include/hl_matrix_base_detail.cuh b/paddle/legacy/cuda/include/hl_matrix_base_detail.cuh similarity index 100% rename from paddle/cuda/include/hl_matrix_base_detail.cuh rename to paddle/legacy/cuda/include/hl_matrix_base_detail.cuh diff --git a/paddle/cuda/include/hl_matrix_ops.cuh b/paddle/legacy/cuda/include/hl_matrix_ops.cuh similarity index 100% 
rename from paddle/cuda/include/hl_matrix_ops.cuh rename to paddle/legacy/cuda/include/hl_matrix_ops.cuh diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/legacy/cuda/include/hl_matrix_type.cuh similarity index 100% rename from paddle/cuda/include/hl_matrix_type.cuh rename to paddle/legacy/cuda/include/hl_matrix_type.cuh diff --git a/paddle/cuda/include/hl_perturbation_util.cuh b/paddle/legacy/cuda/include/hl_perturbation_util.cuh similarity index 100% rename from paddle/cuda/include/hl_perturbation_util.cuh rename to paddle/legacy/cuda/include/hl_perturbation_util.cuh diff --git a/paddle/cuda/include/hl_recurrent_apply.cuh b/paddle/legacy/cuda/include/hl_recurrent_apply.cuh similarity index 100% rename from paddle/cuda/include/hl_recurrent_apply.cuh rename to paddle/legacy/cuda/include/hl_recurrent_apply.cuh diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/legacy/cuda/include/hl_sequence.h similarity index 100% rename from paddle/cuda/include/hl_sequence.h rename to paddle/legacy/cuda/include/hl_sequence.h diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/legacy/cuda/include/hl_sparse.h similarity index 100% rename from paddle/cuda/include/hl_sparse.h rename to paddle/legacy/cuda/include/hl_sparse.h diff --git a/paddle/cuda/include/hl_sparse.ph b/paddle/legacy/cuda/include/hl_sparse.ph similarity index 100% rename from paddle/cuda/include/hl_sparse.ph rename to paddle/legacy/cuda/include/hl_sparse.ph diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/legacy/cuda/include/hl_table_apply.h similarity index 100% rename from paddle/cuda/include/hl_table_apply.h rename to paddle/legacy/cuda/include/hl_table_apply.h diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/legacy/cuda/include/hl_tensor_ops.h similarity index 100% rename from paddle/cuda/include/hl_tensor_ops.h rename to paddle/legacy/cuda/include/hl_tensor_ops.h diff --git a/paddle/cuda/include/hl_thread.ph b/paddle/legacy/cuda/include/hl_thread.ph similarity index 100% rename from paddle/cuda/include/hl_thread.ph rename to paddle/legacy/cuda/include/hl_thread.ph diff --git a/paddle/cuda/include/hl_time.h b/paddle/legacy/cuda/include/hl_time.h similarity index 100% rename from paddle/cuda/include/hl_time.h rename to paddle/legacy/cuda/include/hl_time.h diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/legacy/cuda/include/hl_top_k.h similarity index 100% rename from paddle/cuda/include/hl_top_k.h rename to paddle/legacy/cuda/include/hl_top_k.h diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/legacy/cuda/include/hl_warpctc_wrap.h similarity index 100% rename from paddle/cuda/include/hl_warpctc_wrap.h rename to paddle/legacy/cuda/include/hl_warpctc_wrap.h diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/legacy/cuda/include/stub/hl_aggregate_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_aggregate_stub.h rename to paddle/legacy/cuda/include/stub/hl_aggregate_stub.h diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/legacy/cuda/include/stub/hl_cnn_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_cnn_stub.h rename to paddle/legacy/cuda/include/stub/hl_cnn_stub.h diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_cuda_cublas_stub.h rename to paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h 
b/paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_cuda_cudnn_stub.h rename to paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_cuda_stub.h rename to paddle/legacy/cuda/include/stub/hl_cuda_stub.h diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/legacy/cuda/include/stub/hl_lstm_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_lstm_stub.h rename to paddle/legacy/cuda/include/stub/hl_lstm_stub.h diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/legacy/cuda/include/stub/hl_matrix_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_matrix_stub.h rename to paddle/legacy/cuda/include/stub/hl_matrix_stub.h diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/legacy/cuda/include/stub/hl_sequence_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_sequence_stub.h rename to paddle/legacy/cuda/include/stub/hl_sequence_stub.h diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/legacy/cuda/include/stub/hl_sparse_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_sparse_stub.h rename to paddle/legacy/cuda/include/stub/hl_sparse_stub.h diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/legacy/cuda/src/avx_mathfun.h similarity index 100% rename from paddle/cuda/src/avx_mathfun.h rename to paddle/legacy/cuda/src/avx_mathfun.h diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/legacy/cuda/src/hl_avx_functions.cc similarity index 100% rename from paddle/cuda/src/hl_avx_functions.cc rename to paddle/legacy/cuda/src/hl_avx_functions.cc diff --git a/paddle/cuda/src/hl_batch_norm.cu b/paddle/legacy/cuda/src/hl_batch_norm.cu similarity index 100% rename from paddle/cuda/src/hl_batch_norm.cu rename to paddle/legacy/cuda/src/hl_batch_norm.cu diff --git a/paddle/cuda/src/hl_batch_transpose.cu b/paddle/legacy/cuda/src/hl_batch_transpose.cu similarity index 100% rename from paddle/cuda/src/hl_batch_transpose.cu rename to paddle/legacy/cuda/src/hl_batch_transpose.cu diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/legacy/cuda/src/hl_cpu_functions.cc similarity index 100% rename from paddle/cuda/src/hl_cpu_functions.cc rename to paddle/legacy/cuda/src/hl_cpu_functions.cc diff --git a/paddle/cuda/src/hl_cuda_aggregate.cu b/paddle/legacy/cuda/src/hl_cuda_aggregate.cu similarity index 100% rename from paddle/cuda/src/hl_cuda_aggregate.cu rename to paddle/legacy/cuda/src/hl_cuda_aggregate.cu diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/legacy/cuda/src/hl_cuda_cnn.cu similarity index 100% rename from paddle/cuda/src/hl_cuda_cnn.cu rename to paddle/legacy/cuda/src/hl_cuda_cnn.cu diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/legacy/cuda/src/hl_cuda_cublas.cc similarity index 100% rename from paddle/cuda/src/hl_cuda_cublas.cc rename to paddle/legacy/cuda/src/hl_cuda_cublas.cc diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/legacy/cuda/src/hl_cuda_cudnn.cc similarity index 100% rename from paddle/cuda/src/hl_cuda_cudnn.cc rename to paddle/legacy/cuda/src/hl_cuda_cudnn.cc diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/legacy/cuda/src/hl_cuda_device.cc similarity index 100% rename from paddle/cuda/src/hl_cuda_device.cc rename to paddle/legacy/cuda/src/hl_cuda_device.cc diff --git a/paddle/cuda/src/hl_cuda_lstm.cu 
b/paddle/legacy/cuda/src/hl_cuda_lstm.cu similarity index 100% rename from paddle/cuda/src/hl_cuda_lstm.cu rename to paddle/legacy/cuda/src/hl_cuda_lstm.cu diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/legacy/cuda/src/hl_cuda_matrix.cu similarity index 100% rename from paddle/cuda/src/hl_cuda_matrix.cu rename to paddle/legacy/cuda/src/hl_cuda_matrix.cu diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/legacy/cuda/src/hl_cuda_sequence.cu similarity index 100% rename from paddle/cuda/src/hl_cuda_sequence.cu rename to paddle/legacy/cuda/src/hl_cuda_sequence.cu diff --git a/paddle/cuda/src/hl_cuda_sparse.cu b/paddle/legacy/cuda/src/hl_cuda_sparse.cu similarity index 100% rename from paddle/cuda/src/hl_cuda_sparse.cu rename to paddle/legacy/cuda/src/hl_cuda_sparse.cu diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/legacy/cuda/src/hl_cuda_sparse.cuh similarity index 100% rename from paddle/cuda/src/hl_cuda_sparse.cuh rename to paddle/legacy/cuda/src/hl_cuda_sparse.cuh diff --git a/paddle/cuda/src/hl_math.cc b/paddle/legacy/cuda/src/hl_math.cc similarity index 100% rename from paddle/cuda/src/hl_math.cc rename to paddle/legacy/cuda/src/hl_math.cc diff --git a/paddle/cuda/src/hl_perturbation_util.cu b/paddle/legacy/cuda/src/hl_perturbation_util.cu similarity index 100% rename from paddle/cuda/src/hl_perturbation_util.cu rename to paddle/legacy/cuda/src/hl_perturbation_util.cu diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/legacy/cuda/src/hl_table_apply.cu similarity index 100% rename from paddle/cuda/src/hl_table_apply.cu rename to paddle/legacy/cuda/src/hl_table_apply.cu diff --git a/paddle/cuda/src/hl_time.cc b/paddle/legacy/cuda/src/hl_time.cc similarity index 100% rename from paddle/cuda/src/hl_time.cc rename to paddle/legacy/cuda/src/hl_time.cc diff --git a/paddle/legacy/cuda/src/hl_top_k.cu b/paddle/legacy/cuda/src/hl_top_k.cu new file mode 100644 index 0000000000000000000000000000000000000000..14b9a7f50ffcb6f0159665693288630f0d556706 --- /dev/null +++ b/paddle/legacy/cuda/src/hl_top_k.cu @@ -0,0 +1,481 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/legacy/cuda/include/hl_base.h" +#include "paddle/legacy/cuda/include/hl_sparse.ph" +#include "paddle/legacy/cuda/include/hl_top_k.h" +#include "paddle/utils/Logging.h" + +// using namespace hppl; + +struct Pair { + __device__ __forceinline__ Pair() {} + + __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {} + + __device__ __forceinline__ void set(real value, int id) { + v_ = value; + id_ = id; + } + + __device__ __forceinline__ void operator=(const Pair& in) { + v_ = in.v_; + id_ = in.id_; + } + + __device__ __forceinline__ bool operator<(const real value) const { + return (v_ < value); + } + + __device__ __forceinline__ bool operator<(const Pair& in) const { + return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_)); + } + + __device__ __forceinline__ bool operator>(const Pair& in) const { + return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_)); + } + + real v_; + int id_; +}; + +__device__ __forceinline__ void addTo(Pair topK[], + const Pair& p, + int beamSize) { + for (int k = beamSize - 2; k >= 0; k--) { + if (topK[k] < p) { + topK[k + 1] = topK[k]; + } else { + topK[k + 1] = p; + return; + } + } + topK[0] = p; +} + +template +__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) { + for (int k = beamSize - 2; k >= 0; k--) { + if (topK[k] < p) { + topK[k + 1] = topK[k]; + } else { + topK[k + 1] = p; + return; + } + } + topK[0] = p; +} + +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, int beamSize) { + while (idx < dim) { + if (topK[beamSize - 1] < src[idx]) { + Pair tmp(src[idx], idx); + addTo(topK, tmp, beamSize); + } + idx += blockSize; + } +} + +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) { + while (idx < dim) { + if (topK[beamSize - 1] < src[idx]) { + Pair tmp(src[idx], idx); + if (tmp < max) { + addTo(topK, tmp, beamSize); + } + } + idx += blockSize; + } +} + +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* val, int* col, int idx, int dim, int beamSize) { + while (idx < dim) { + if (topK[beamSize - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + addTo(topK, tmp, beamSize); + } + idx += blockSize; + } +} + +template +__device__ __forceinline__ void getTopK(Pair topK[], + real* val, + int* col, + int idx, + int dim, + const Pair& max, + int beamSize) { + while (idx < dim) { + if (topK[beamSize - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + if (tmp < max) { + addTo(topK, tmp, beamSize); + } + } + idx += blockSize; + } +} + +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* src, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { + if (beam > 0) { + int length = beam < beamSize ? beam : beamSize; + if (firstStep) { + firstStep = false; + getTopK(topK, src, tid, dim, length); + } else { + for (int k = 0; k < maxLength; k++) { + if (k < maxLength - beam) { + topK[k] = topK[k + beam]; + } else { + topK[k].set(-HL_FLOAT_MAX, -1); + } + } + if (!isEmpty) { + getTopK(topK + maxLength - beam, src, tid, dim, max, length); + } + } + + max = topK[maxLength - 1]; + if (max.id_ == -1) isEmpty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* val, + int* col, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { + if (beam > 0) { + int length = beam < beamSize ? 
beam : beamSize; + if (firstStep) { + firstStep = false; + getTopK(topK, val, col, tid, dim, length); + } else { + for (int k = 0; k < maxLength; k++) { + if (k < maxLength - beam) { + topK[k] = topK[k + beam]; + } else { + topK[k].set(-HL_FLOAT_MAX, -1); + } + } + if (!isEmpty) { + getTopK( + topK + maxLength - beam, val, col, tid, dim, max, length); + } + } + + max = topK[maxLength - 1]; + if (max.id_ == -1) isEmpty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void blockReduce(Pair* shTopK, + int* maxId, + Pair topK[], + real** topVal, + int** topIds, + int& beam, + int& beamSize, + const int tid, + const int warp) { + while (true) { + __syncthreads(); + if (tid < blockSize / 2) { + if (shTopK[tid] < shTopK[tid + blockSize / 2]) { + maxId[tid] = tid + blockSize / 2; + } else { + maxId[tid] = tid; + } + } + __syncthreads(); + for (int stride = blockSize / 4; stride > 0; stride = stride / 2) { + if (tid < stride) { + if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) { + maxId[tid] = maxId[tid + stride]; + } + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + **topVal = shTopK[maxId[0]].v_; + **topIds = shTopK[maxId[0]].id_; + (*topVal)++; + (*topIds)++; + } + if (tid == maxId[0]) beam++; + if (--beamSize == 0) break; + __syncthreads(); + + // NOTE(zcd): temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + + if (tid == maxId[0]) { + if (beam < maxLength) { + shTopK[tid] = topK[beam]; + } + } + if (maxId[0] / 32 == warp) { + if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break; + } + } +} + +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top maxLength value; + * 2. merge to shTopK, block reduce and get max value; + * 3. go to the second setp, until one thread's topK value is null; + * 4. go to the first setp, until get the topK value. + */ +template +__global__ void KeMatrixTopK(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int beamSize) { + __shared__ Pair shTopK[blockSize]; + __shared__ int maxId[blockSize / 2]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + src += blockIdx.x * lds; + topVal += blockIdx.x * ldv; + topIds += blockIdx.x * beamSize; + + Pair topK[maxLength]; // NOLINT + int beam = maxLength; + Pair max; + bool isEmpty = false; + bool firstStep = true; + + for (int k = 0; k < maxLength; k++) { + topK[k].set(-HL_FLOAT_MAX, -1); + } + while (beamSize) { + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + + shTopK[tid] = topK[0]; + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + } +} + +template +__global__ void KeSMatrixTopK(real* topVal, + int ldv, + int* topIds, + real* val, + int* row, + int* col, + int beamSize) { + __shared__ Pair shTopK[blockSize]; + __shared__ int maxId[blockSize / 2]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + topVal += blockIdx.x * ldv; + topIds += blockIdx.x * beamSize; + + Pair topK[maxLength]; // NOLINT + int beam = maxLength; + Pair max; + bool isEmpty = false; + bool firstStep = true; + + int start = row[blockIdx.x]; + int end = row[blockIdx.x + 1]; + int dim = end - start; + val += start; + col += start; + + if (beamSize > dim) { + // if the number of values to sort are less than the output size, + // use -1 to indicate the end of valid sorted values. 
+ if (tid == 0) { + topIds[dim] = -1; + } + + beamSize = dim; + } + + for (int k = 0; k < maxLength; k++) { + topK[k].set(-HL_FLOAT_MAX, -1); + } + while (beamSize) { + threadGetTopK( + topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); + + shTopK[tid] = topK[0]; + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + } +} + +void hl_matrix_top_k(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int beamSize, + int numSamples) { + CHECK_NOTNULL(topVal); + CHECK_NOTNULL(topIds); + CHECK_NOTNULL(src); + + if (beamSize > dim) beamSize = dim; + + dim3 threads(256, 1); + dim3 grid(numSamples, 1); + KeMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, beamSize); + + CHECK_SYNC("hl_matrix_top_k failed"); +} + +void hl_sparse_matrix_top_k(real* topVal, + int ldv, + int* topIds, + hl_sparse_matrix_s src, + int beamSize, + int numSamples) { + CHECK_NOTNULL(topVal); + CHECK_NOTNULL(topIds); + CHECK_NOTNULL(src); + CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!"; + + hl_csr_matrix csr = (hl_csr_matrix)src->matrix; + if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) { + LOG(FATAL) << "parameter src is null!"; + } + + dim3 threads(256, 1); + dim3 grid(numSamples, 1); + KeSMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); + + CHECK_SYNC("hl_sparse_matrix_top_k failed"); +} + +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top maxLength value; + * 2. merge to shTopK, block reduce and get max value; + * 3. go to the second setp, until one thread's topK value is null; + * 4. go to the first setp, until get the topK value. + */ +template +__global__ void KeMatrixTopKClassificationError(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int beamSize, + int* label, + real* recResult) { + __shared__ Pair shTopK[blockSize]; + __shared__ int maxId[blockSize / 2]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + src += blockIdx.x * lds; + topVal += blockIdx.x * ldv; + topIds += blockIdx.x * beamSize; + + Pair topK[maxLength]; // NOLINT + int beam = maxLength; + Pair max; + bool isEmpty = false; + bool firstStep = true; + int topkSize = beamSize; + + for (int k = 0; k < maxLength; k++) { + topK[k].set(-HL_FLOAT_MAX, -1); + } + + while (beamSize) { + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + + shTopK[tid] = topK[0]; + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + } + + __syncthreads(); + if (tid == 0) { + for (int i = 0; i < topkSize; i++) { + if (*--topIds == label[blockIdx.x]) { + recResult[blockIdx.x] = 0; + break; + } + recResult[blockIdx.x] = 1.0f; + } + } +} + +void hl_matrix_classification_error(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int topkSize, + int numSamples, + int* label, + real* recResult) { + CHECK_NOTNULL(topVal); + CHECK_NOTNULL(topIds); + CHECK_NOTNULL(src); + + if (topkSize > dim) topkSize = dim; + + dim3 threads(256, 1); + dim3 grid(numSamples, 1); + KeMatrixTopKClassificationError<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); + + CHECK_SYNC("hl_matrix_top_k classification error failed"); +} diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/legacy/cuda/src/hl_warpctc_wrap.cc similarity index 100% rename from paddle/cuda/src/hl_warpctc_wrap.cc rename to 
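Editor's note: the kernels above keep a private `maxLength`-entry beam per thread and repeatedly merge the beams through the shared `shTopK` array until `beamSize` results have been emitted. A minimal host-side sketch of the per-row output contract (the helper `topKRow` is a hypothetical name, not part of this patch); note the tie-breaking rule from `Pair::operator<`: for equal values the smaller index wins.

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Reference top-k for one row, matching the kernels' ordering: sort by value
// descending, break ties by smaller column index first (see Pair::operator<).
std::vector<std::pair<float, int>> topKRow(const float* src, int dim, int k) {
  std::vector<std::pair<float, int>> items(dim);
  for (int i = 0; i < dim; ++i) items[i] = {src[i], i};
  std::partial_sort(items.begin(), items.begin() + k, items.end(),
                    [](const std::pair<float, int>& a,
                       const std::pair<float, int>& b) {
                      return a.first > b.first ||
                             (a.first == b.first && a.second < b.second);
                    });
  items.resize(k);  // keep only the k best (value, index) pairs
  return items;
}
```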
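Editor's note: `blockReduce` above depends on the `CREATE_SHFL_MASK` / `__shfl_sync` compatibility layer added to `hl_base.h` earlier in this patch. A minimal sketch of that idiom, assuming those definitions are in scope; `warpReduceSum` is a hypothetical name used only for illustration.

```cuda
// Hypothetical illustration, not part of this patch: a warp-wide sum that
// compiles both before and after CUDA 9.0 thanks to the shims in hl_base.h.
template <typename T>
__device__ T warpReduceSum(T val) {
  unsigned mask = 0u;
  CREATE_SHFL_MASK(mask, true);  // 0u before CUDA 9.0, a ballot mask after
  for (int offset = 16; offset > 0; offset /= 2) {
    // Before CUDA 9.0 this resolves to the fallback that calls __shfl_down.
    val += __shfl_down_sync(mask, val, offset);
  }
  return val;  // lane 0 of each warp holds the sum
}
```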
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/legacy/cuda/src/hl_warpctc_wrap.cc similarity index 100% rename from paddle/cuda/src/hl_warpctc_wrap.cc rename to paddle/legacy/cuda/src/hl_warpctc_wrap.cc diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/legacy/function/BlockExpandOp.cpp similarity index 100% rename from paddle/function/BlockExpandOp.cpp rename to paddle/legacy/function/BlockExpandOp.cpp diff --git a/paddle/function/BlockExpandOpTest.cpp b/paddle/legacy/function/BlockExpandOpTest.cpp similarity index 100% rename from paddle/function/BlockExpandOpTest.cpp rename to paddle/legacy/function/BlockExpandOpTest.cpp diff --git a/paddle/legacy/function/BufferArg.cpp b/paddle/legacy/function/BufferArg.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1f3d505c31bf8d50503032a4baae6230b9f7241d --- /dev/null +++ b/paddle/legacy/function/BufferArg.cpp @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <glog/logging.h> + +#include "BufferArg.h" +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { + +const SequenceArg& BufferArg::sequence() const { + CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA); + return dynamic_cast<const SequenceArg&>(*this); +} + +const SparseMatrixArg& BufferArg::sparse() const { + CHECK_EQ(bufferType_, TENSOR_SPARSE); + return dynamic_cast<const SparseMatrixArg&>(*this); +} + +SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType) + : BufferArg(sparse, argType), + row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32), + col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32), + nnz_(sparse.getElementCnt()), + format_(static_cast<SparseDataFormat>(sparse.getFormat())), + type_(static_cast<SparseDataType>(sparse.getValueType())) { + bufferType_ = TENSOR_SPARSE; +} + +SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType) + : BufferArg(sparse, argType), + row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32), + col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32), + nnz_(sparse.getElementCnt()), + format_(static_cast<SparseDataFormat>(sparse.getFormat())), + type_(static_cast<SparseDataType>(sparse.getValueType())) { + bufferType_ = TENSOR_SPARSE; +} + +} // namespace paddle diff --git a/paddle/legacy/function/BufferArg.h b/paddle/legacy/function/BufferArg.h new file mode 100644 index 0000000000000000000000000000000000000000..1f47ad556d29363d784fde718fdacdf0658ef010 --- /dev/null +++ b/paddle/legacy/function/BufferArg.h @@ -0,0 +1,364 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once + +#include + +#include "TensorShape.h" +#include "TensorType.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +enum BufferType { + TENSOR_UNKNOWN = 0, + TENSOR_NORMAL = 1, + TENSOR_SEQUENCE_ID = 2, + TENSOR_SEQUENCE_DATA = 3, + TENSOR_SPARSE = 4 +}; + +class BufferArg; +class SequenceArg; +class SparseMatrixArg; + +/** + * \brief BufferArg used as the argument type of Function. + * + * The arguments of the Paddle Function have four Buffer types. + * 1. BufferArg for a dense Buffer of any dimension. + * 2. SequenceIdArg for a Buffer of sequence start positions. + * 3. SequenceArg for a Buffer of sequence data. + * 4. SparseMatrixArg for a Buffer of sparse matrix. + * + * Buffer shape + * For most buffers, the first dimension `shape()[0]` represents + * the size of the mini-batch. + * + * Buffer argType + * There is an ArgType property for the BufferArg used as Function Output. + * Whether the result of the Function calculation is assigned to the + * output Buffer or added to the output Buffer is determined by the + * argType_ property of the output BufferArg. + */ + +// ArgType is only used by output BufferArg. +// For input argument, argType_ is ignored. +// For output argument, need to set the argType_ of the BufferArg. +enum ArgType { + UNSPECIFIED = 0, + ASSIGN_TO = 1, + ADD_TO = 2, +}; +class BufferArg { + public: + void setArgType(ArgType argType) { argType_ = argType; } + + ArgType getArgType() const { return argType_; } + + public: + BufferArg(ValueType valueType, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) { + bufferType_ = TENSOR_NORMAL; + } + + BufferArg(void* buf, + ValueType valueType, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) { + bufferType_ = TENSOR_NORMAL; + } + + BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) { + bufferType_ = TENSOR_NORMAL; + } + + BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED) + : buf_( + const_cast(reinterpret_cast(matrix.getData()))), + valueType_(DataType::value), + shape_(2), + argType_(argType) { + bufferType_ = TENSOR_NORMAL; + shape_.setDim(0, matrix.getHeight()); + shape_.setDim(1, matrix.getWidth()); + } + + BufferArg(const Matrix& matrix, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : buf_( + const_cast(reinterpret_cast(matrix.getData()))), + valueType_(DataType::value), + shape_(shape), + argType_(argType) { + bufferType_ = TENSOR_NORMAL; + CHECK_EQ(matrix.getElementCnt(), shape.getElements()); + } + + BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED) + : buf_( + const_cast(reinterpret_cast(vector.getData()))), + valueType_(DataType::value), + shape_(1), + argType_(argType) { + bufferType_ = TENSOR_NORMAL; + shape_.setDim(0, vector.getSize()); + } + + BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED) + : buf_( + const_cast(reinterpret_cast(vector.getData()))), + valueType_(VALUE_TYPE_INT32), + shape_(1), + argType_(argType) { + bufferType_ = TENSOR_NORMAL; + shape_.setDim(0, vector.getSize()); + } + + template + typename Tensor::Matrix matrix() const { + CHECK(buf_); + CHECK(valueType_ == DataType::value); + // CHECK(deviceType_ == DType); + CHECK_EQ((size_t)2, shape_.ndims()); + return typename Tensor::Matrix( + reinterpret_cast(buf_), shape_[0], shape_[1]); + } + + template + typename Tensor::Vector vector() const { + CHECK(buf_); + 
CHECK(valueType_ == DataType::value); + // CHECK(deviceType_ == DType); + CHECK_EQ((size_t)1, shape_.ndims()); + return typename Tensor::Vector( + shape_[0], reinterpret_cast(buf_)); + } + + virtual ~BufferArg() {} + + template + T* data() const { + return reinterpret_cast(buf_); + } + + void* data() const { return buf_; } + ValueType valueType() const { return valueType_; } + BufferType bufferType() const { return bufferType_; } + const TensorShape& shape() const { return shape_; } + bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; } + bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; } + virtual size_t numElements() const { return shape_.getElements(); } + + const SequenceArg& sequence() const; + const SparseMatrixArg& sparse() const; + + protected: + void* buf_; + ValueType valueType_; + TensorShape shape_; + BufferType bufferType_{TENSOR_UNKNOWN}; + ArgType argType_{UNSPECIFIED}; + // TODO(tianbing), add deviceType_ + // leading dimensions. The size is dims_.size() + // Dims lds_; +}; + +// sequence start positions in a mini-batch of sequences +// shape_.ndims() == 1 +// valueType_ = int32 +// if a < b then value_.buf_[a] < value_.buf_[b] +class SequenceIdArg : public BufferArg { + public: + SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED) + : BufferArg(VALUE_TYPE_INT32, shape, argType) { + bufferType_ = TENSOR_SEQUENCE_ID; + CHECK_EQ(shape_.ndims(), 1UL); + CHECK_GE(shape_[0], 1UL); + numSeqs_ = shape_[0] - 1; + } + + SequenceIdArg(void* buf, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) { + bufferType_ = TENSOR_SEQUENCE_ID; + CHECK_EQ(shape_.ndims(), 1UL); + numSeqs_ = shape_[0] - 1; + } + + SequenceIdArg(const IVector& vector) : BufferArg(vector) { + bufferType_ = TENSOR_SEQUENCE_ID; + numSeqs_ = shape_[0] - 1; + } + + ~SequenceIdArg() {} + + size_t numSeqs() const { return numSeqs_; } + + private: + size_t numSeqs_; +}; + +// sequences data +// For mini-batch calculate, +// one batch can contain more than one sequence of data. +// SequenceArg can be used to represent sequences that contain multiple +// unequal lengths. 
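Editor's note: a tiny worked example (not part of the patch) of the start-position convention that `SequenceIdArg` encodes and the `SequenceArg` class below consumes: N sequences are described by N + 1 monotonically increasing offsets, hence `numSeqs_ = shape_[0] - 1`.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  // Three variable-length sequences packed into the 9 rows of one mini-batch:
  // rows [0,3) form sequence 0, [3,5) sequence 1, [5,9) sequence 2.
  std::vector<int> starts = {0, 3, 5, 9};
  size_t numSeqs = starts.size() - 1;  // mirrors SequenceIdArg::numSeqs()
  assert(numSeqs == 3);
  return 0;
}
```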
+class SequenceArg : public BufferArg { + public: + SequenceArg(ValueType valueType, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : BufferArg(valueType, shape, argType), + startPositions_(TensorShape({shape[0]})) { + bufferType_ = TENSOR_SEQUENCE_DATA; + } + + SequenceArg(void* buf, + ValueType valueType, + const TensorShape& shape, + const SequenceIdArg& startPositions, + ArgType argType = UNSPECIFIED) + : BufferArg(buf, valueType, shape, argType), + startPositions_(startPositions) { + bufferType_ = TENSOR_SEQUENCE_DATA; + } + + SequenceArg(const Matrix& matrix, + const IVector& vector, + ArgType argType = UNSPECIFIED) + : BufferArg(matrix, argType), startPositions_(vector) { + bufferType_ = TENSOR_SEQUENCE_DATA; + } + + ~SequenceArg() {} + + void* getIdBuf() const { return startPositions_.data(); } + size_t numSeqs() const { return startPositions_.numSeqs(); } + SequenceIdArg& getSequenceId() { return startPositions_; } + const SequenceIdArg& getSequenceId() const { return startPositions_; } + + private: + SequenceIdArg startPositions_; +}; + +// sparse matrix +// valueType_ == float or double +// shape_.ndims() == 2 +class SparseMatrixArg : public BufferArg { + public: + SparseMatrixArg(void* buf, + ValueType valueType, + const TensorShape& shape, + const BufferArg& row, + const BufferArg& col, + size_t nnz, + SparseFormat format, + SparseValueType type, + ArgType argType = UNSPECIFIED) + : BufferArg(buf, valueType, shape, argType), + row_(row), + col_(col), + nnz_(nnz), + format_(static_cast(format)), + type_(static_cast(type)) { + bufferType_ = TENSOR_SPARSE; + CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); + CHECK_EQ(shape_.ndims(), 2UL); + CHECK_EQ(row_.shape().ndims(), 1UL); + CHECK_EQ(col_.shape().ndims(), 1UL); + if (format_ == T_SPARSE_CSR) { + CHECK_EQ(nnz, col.shape()[0]); + } else if (format_ == T_SPARSE_CSC) { + CHECK_EQ(nnz, row.shape()[0]); + } + } + + SparseMatrixArg(ValueType valueType, + const TensorShape& shape, + size_t nnz, + SparseFormat format, + SparseValueType type, + ArgType argType = UNSPECIFIED) + : BufferArg(valueType, shape, argType), + row_(BufferArg(nullptr, VALUE_TYPE_INT32)), + col_(BufferArg(nullptr, VALUE_TYPE_INT32)), + nnz_(nnz), + format_(static_cast(format)), + type_(static_cast(type)) { + bufferType_ = TENSOR_SPARSE; + CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); + CHECK_EQ(shape_.ndims(), 2UL); + + /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr + row_ = (format_ == T_SPARSE_CSR + ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1}) + : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})); + /// len of col_ : width + 1 (CSC) or nnz (CSR), buf_ == nullptr + col_ = (format_ == T_SPARSE_CSR + ? 
BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}) + : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1})); + } + + SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); + + SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); + + template + typename Tensor::SparseMatrix SparseMatrix() const { + CHECK(buf_); + CHECK(valueType_ == DataType::value); + // CHECK(deviceType_ == DType); + CHECK_EQ(2UL, shape_.ndims()); + return typename Tensor::SparseMatrix( + reinterpret_cast(buf_), + reinterpret_cast(row_.data()), + reinterpret_cast(col_.data()), + shape_[0], + shape_[1], + nnz_, + static_cast(type_), + static_cast(format_), + false); + } + + ~SparseMatrixArg() {} + + void* getRowBuf() const { return row_.data(); } + + void* getColBuf() const { return col_.data(); } + + size_t nnz() const { return nnz_; } + + size_t numElements() const override { return nnz_; } + + SparseDataFormat dataFormat() const { return format_; } + + SparseDataType dataType() const { return type_; } + + private: + BufferArg row_; + BufferArg col_; + size_t nnz_; + SparseDataFormat format_; + SparseDataType type_; +}; + +} // namespace paddle diff --git a/paddle/legacy/function/BufferArgTest.cpp b/paddle/legacy/function/BufferArgTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ec153bea89f25414b0df3088ab0c366c92ecbe0 --- /dev/null +++ b/paddle/legacy/function/BufferArgTest.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "BufferArg.h" +#include +#include "paddle/legacy/math/MemoryHandle.h" + +namespace paddle { + +TEST(BufferTest, BufferArg) { + TensorShape shape({8, 10}); + CpuMemoryHandle memory(shape.getElements() * + sizeOfValuType(VALUE_TYPE_FLOAT)); + BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape); + EXPECT_EQ(buffer.data(), memory.getBuf()); +} + +TEST(BufferTest, SequenceIdArg) { + TensorShape shape({10}); + CpuMemoryHandle memory(shape.getElements() * + sizeOfValuType(VALUE_TYPE_INT32)); + SequenceIdArg buffer(memory.getBuf(), shape); + EXPECT_EQ(buffer.data(), memory.getBuf()); + EXPECT_EQ(buffer.numSeqs(), 9U); +} + +} // namespace paddle diff --git a/paddle/function/CMakeLists.txt b/paddle/legacy/function/CMakeLists.txt similarity index 100% rename from paddle/function/CMakeLists.txt rename to paddle/legacy/function/CMakeLists.txt diff --git a/paddle/legacy/function/ContextProjectionOp.cpp b/paddle/legacy/function/ContextProjectionOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..05a3f915862b6657fc0a4300cbbea36721219e10 --- /dev/null +++ b/paddle/legacy/function/ContextProjectionOp.cpp @@ -0,0 +1,412 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ContextProjectionOp.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" + +namespace paddle { +/** + * Context Projection Forward with CPU Matrix Device. + * + */ +template <> +void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat, + const CpuMatrix& input_mat, + const CpuMatrix& weight_mat, + const CpuIVector& seq_vec, + size_t context_length, + int context_start, + size_t begin_pad) { + const int* starts = seq_vec.getData(); + const size_t num_sequences = seq_vec.getSize() - 1; + for (size_t i = 0; i < num_sequences; ++i) { + for (size_t j = 0; j < context_length; ++j) { + int begin = starts[i] + context_start + j; + int end = starts[i + 1] + context_start + j; + int dst_begin = starts[i]; + int dst_end = starts[i + 1]; + if (begin < starts[i]) { + int64_t pad_size = + std::min(starts[i] - begin, starts[i + 1] - starts[i]); + MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size); + if (weight_mat) { + MatrixPtr sub = + const_cast<CpuMatrix&>(weight_mat).subMatrix(j, pad_size); + mat->addAtOffset(*sub, j * input_mat.getWidth()); + } + dst_begin = starts[i] + pad_size; + begin = starts[i]; + } + if (end > starts[i + 1]) { + int64_t pad_size = + std::min(end - starts[i + 1], starts[i + 1] - starts[i]); + MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size); + if (weight_mat) { + MatrixPtr sub = + const_cast<CpuMatrix&>(weight_mat) + .subMatrix(begin_pad + context_start + j - pad_size, + pad_size); + mat->addAtOffset(*sub, j * input_mat.getWidth()); + } + dst_end = starts[i + 1] - pad_size; + end = starts[i + 1]; + } + if (end <= begin) continue; + MatrixPtr src = + const_cast<CpuMatrix&>(input_mat).subMatrix(begin, end - begin); + MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin); + dst->addAtOffset(*src, j * input_mat.getWidth()); + } + } +} + +/** + * Paddle Function for Context Projection Forward. + * Calculate the output layer value sequence after context projection. + * + * What is Context Projection for a sequence? + * For example, assume the input (x) has 4 words and the dimension of each word + * representation is 2. If we pad with zeros instead of learned weights, + * and the context_length is 3, the output (y) is: + * + * @code + * x = [a1, a2; + * b1, b2; + * c1, c2; + * d1, d2] + * y = [0, 0, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, d1, d2; + * c1, c2, d1, d2, 0, 0] + * @endcode + * + * \param outputs[0].matrix output layer value, n * (d * l) + * \param outputs[0].vector start position sequence, n * 1 + * \param inputs[0].matrix input layer value, n * d + * \param inputs[0].vector start position sequence, n * 1 + * \param inputs[1].matrix input layer weight, pad * d + */ +template <DeviceType Device> +class ContextProjectionForwardFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + context_length_ = config.get<size_t>("context_length"); + context_start_ = config.get<int>("context_start"); + begin_pad_ = config.get<size_t>("begin_pad"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK(1UL == inputs.size() || 2UL == inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here"; + const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]); + auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]); + + CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data()); + CHECK_EQ(out_seq.shape().ndims(), 2UL); + CHECK_EQ(val_seqs.shape().ndims(), 2UL); + /// dim of output = dim of input * context_length + CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_); + /// input and output has the same batch_size + CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]); + if (2UL == inputs.size()) { + CHECK_EQ(inputs[1].shape().ndims(), 2UL); + /// dim of input == dim of weight + CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]); + } + + CHECK_EQ(out_seq.getArgType(), ADD_TO); + auto out_mat = out_seq.matrix<Device>(); + const auto in_mat = val_seqs.matrix<Device>(); + const auto w_mat = + (2UL == inputs.size() && inputs[1].data()) + ? inputs[1].matrix<Device>() + : typename Tensor<real, Device>::Matrix(nullptr, 0, 0); + const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>(); + + ContextProjectionForward<Device>(out_mat, + in_mat, + w_mat, + seq_vec, + context_length_, + context_start_, + begin_pad_); + } + + private: + size_t context_length_; + int context_start_; + size_t begin_pad_; +}; + +/** + * Context Projection Backward with CPU Matrix Device. + * + */ +template <> +void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat, + CpuMatrix& in_grad_mat, + CpuMatrix& w_grad_mat, + const CpuIVector& seq_vec, + size_t context_length, + int context_start, + size_t begin_pad, + bool is_padding, + size_t total_pad) { + size_t input_dim = in_grad_mat ? in_grad_mat.getWidth() + : w_grad_mat ? w_grad_mat.getWidth() : 0; + const int* starts = seq_vec.getData(); + size_t num_sequences = seq_vec.getSize() - 1; + for (size_t i = 0; i < num_sequences; ++i) { + for (size_t j = 0; j < context_length; ++j) { + int begin = starts[i] + context_start + j; + int end = starts[i + 1] + context_start + j; + int dst_begin = starts[i]; + int dst_end = starts[i + 1]; + if (begin < starts[i]) { + int64_t pad_size = + std::min(starts[i] - begin, starts[i + 1] - starts[i]); + if (is_padding && w_grad_mat) { + MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat) + .subMatrix(starts[i], pad_size); + MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size); + sub->addAtOffset(*mat, j * input_dim); + } + dst_begin = starts[i] + pad_size; + begin = starts[i]; + } + if (end > starts[i + 1]) { + int64_t pad_size = + std::min(end - starts[i + 1], starts[i + 1] - starts[i]); + if (is_padding && w_grad_mat) { + MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat) + .subMatrix(starts[i + 1] - pad_size, pad_size); + MatrixPtr sub = w_grad_mat.subMatrix( + begin_pad + context_start + j - pad_size, pad_size); + sub->addAtOffset(*mat, j * input_dim); + } + dst_end = starts[i + 1] - pad_size; + end = starts[i + 1]; + } + if (end <= begin) continue; + if (!in_grad_mat) continue; + MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin); + MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat) + .subMatrix(dst_begin, dst_end - dst_begin); + src->addAtOffset(*dst, j * input_dim); + } + } +} + +/** + * Context Projection Backward Function. + * Update the weight gradient and input layer gradient with backprop + * + * \param inputs[0].matrix output layer grad, n * (d * l) + * \param inputs[0].vector start position sequence, n * 1 + * \param outputs[0].matrix input layer grad, n * d + * \param outputs[0].vector start position sequence, n * 1 + * \param outputs[1] weight grad, pad * d + */ +template <DeviceType Device> +class ContextProjectionBackwardFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + context_length_ = config.get<size_t>("context_length"); + context_start_ = config.get<int>("context_start"); + begin_pad_ = config.get<size_t>("begin_pad"); + is_padding_ = config.get<bool>("is_padding"); + total_pad_ = config.get<size_t>("total_pad"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK(1UL == outputs.size() || 2UL == outputs.size()); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here"; + const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]); + auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]); + CHECK(in_seq.data() && in_seq.getSequenceId().data()); + CHECK_EQ(in_seq.shape().ndims(), 2UL); + CHECK_EQ(out_seq.shape().ndims(), 2UL); + CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL); + + /// input and output grad has the same batch_size + CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]); + /// dim of output grad = dim of input grad * context_length + CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_); + CHECK_EQ(out_seq.getArgType(), ADD_TO); + + if (2UL == outputs.size()) { + CHECK_EQ(outputs[1].shape().ndims(), 2UL); + /// dim of input grad == dim of weight + CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]); + CHECK_EQ(outputs[1].getArgType(), ADD_TO); + } + + const auto seq_vec = in_seq.getSequenceId().vector<int, Device>(); + const auto out_grad_mat = in_seq.matrix<Device>(); + auto in_grad_mat = + !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0) + : out_seq.matrix<Device>(); + auto w_grad_mat = + (2UL == outputs.size() && outputs[1].data()) + ? outputs[1].matrix<Device>() + : typename Tensor<real, Device>::Matrix(nullptr, 0, 0); + + ContextProjectionBackward<Device>(out_grad_mat, + in_grad_mat, + w_grad_mat, + seq_vec, + context_length_, + context_start_, + begin_pad_, + is_padding_, + total_pad_); + } + + private: + size_t context_length_; + int context_start_; + size_t begin_pad_; + bool is_padding_; + size_t total_pad_; +}; + +/** + * Context Projection Backward Data Function + * Update input layer grad + * input: sequence of output layer grad + * output: sequence of input layer grad + * + * \param outputs[0].matrix input layer grad, n * d + * \param outputs[0].vector start position sequence, n * 1 + * \param inputs[0].matrix output layer grad, n * (d * l) + * \param inputs[0].vector start position sequence, n * 1 + */ +template <DeviceType Device> +class ContextProjectionBackwardDataFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + context_length_ = config.get<size_t>("context_length"); + context_start_ = config.get<int>("context_start"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here"; + const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]); + const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]); + + CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data()); + CHECK_EQ(out_seq.shape().ndims(), 2UL); + CHECK_EQ(in_seq.shape().ndims(), 2UL); + CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL); + /// output layer grad dim == input layer grad dim * context_length_ + CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_); + /// input and output has the same batch_size + CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + + const auto out_grad_mat = in_seq.matrix<Device>(); + const auto seq_vec = in_seq.getSequenceId().vector<int, Device>(); + auto in_grad_mat = out_seq.matrix<Device>(); + + ContextProjectionBackwardData<Device>( + out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_); + } + + private: + size_t context_length_; + int context_start_; +}; + +/** + * Context Projection Backward Weight Function + * Update weight grad by backprop + * input: sequence of output layer grad + * output: weight grad + * + * \param outputs[0] weight grad, pad * d + * \param inputs[0].matrix output layer grad, n * (d * l) + * \param inputs[0].vector start position sequence, n * 1 + */ +template <DeviceType Device> +class ContextProjectionBackwardWeightFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + context_length_ = config.get<size_t>("context_length"); + context_start_ = config.get<int>("context_start"); + begin_pad_ = config.get<size_t>("begin_pad"); + total_pad_ = config.get<size_t>("total_pad"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here"; + const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]); + CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data()); + CHECK_EQ(outputs[0].shape().ndims(), 2UL); + CHECK_EQ(in_seq.shape().ndims(), 2UL); + CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL); + CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]); + /// output layer grad dim == weight dim * context_length_ + CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + + const auto seq_vec = in_seq.getSequenceId().vector<int, Device>(); + const auto out_grad_mat = in_seq.matrix<Device>(); + auto w_grad_mat = outputs[0].matrix<Device>(); + ContextProjectionBackwardWeight<Device>(out_grad_mat, + w_grad_mat, + seq_vec, + context_length_, + context_start_, + total_pad_, + begin_pad_); + } + + private: + size_t context_length_; + int context_start_; + size_t begin_pad_; + size_t total_pad_; +}; + +REGISTER_TYPED_FUNC(ContextProjectionForward, + CPU, + ContextProjectionForwardFunc); +REGISTER_TYPED_FUNC(ContextProjectionBackward, + CPU, + ContextProjectionBackwardFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(ContextProjectionForward, + GPU, + ContextProjectionForwardFunc); +REGISTER_TYPED_FUNC(ContextProjectionBackward, + GPU, + ContextProjectionBackwardFunc); +REGISTER_TYPED_FUNC(ContextProjectionBackwardData, + GPU, + ContextProjectionBackwardDataFunc); +REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight, + GPU, + ContextProjectionBackwardWeightFunc); +#endif +} // namespace paddle diff --git a/paddle/function/ContextProjectionOp.h b/paddle/legacy/function/ContextProjectionOp.h similarity index 100% rename from paddle/function/ContextProjectionOp.h rename to paddle/legacy/function/ContextProjectionOp.h diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/legacy/function/ContextProjectionOpGpu.cu similarity index 100% rename from paddle/function/ContextProjectionOpGpu.cu rename to paddle/legacy/function/ContextProjectionOpGpu.cu diff --git a/paddle/legacy/function/ContextProjectionOpTest.cpp b/paddle/legacy/function/ContextProjectionOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3b0a34567fe17b466de6186e537243fe8166a77a --- /dev/null +++ b/paddle/legacy/function/ContextProjectionOpTest.cpp @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <gtest/gtest.h> +#include "FunctionTest.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT + +void testMatrixProjectionForward(int context_start, + size_t context_length, + bool is_padding, + size_t batch_size, + size_t input_dim) { + size_t pad = std::max(0, -context_start) + + std::max(0, (int)(context_start + context_length - 1)); + if (pad == 0) is_padding = false; + + CpuGpuFuncCompare test( + "ContextProjectionForward", + FuncConfig() + .set("context_length", context_length) + .set("context_start", context_start) + .set("begin_pad", (size_t)std::max(0, -context_start))); + + // prepare input arguments + test.addSequence(SequenceIdArg(TensorShape{batch_size})); + test.addInputs( + SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim})); + if (is_padding) { // weight + test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim})); + } + test.addOutputs( + SequenceArg(VALUE_TYPE_FLOAT, + TensorShape{batch_size, input_dim * context_length}), + ADD_TO); + + // run Function + test.run(); +} + +void testMatrixProjectionBackward(int context_start, + size_t context_length, + bool is_padding, + size_t batch_size, + size_t input_dim) { + size_t pad = std::max(0, -context_start) + + std::max(0, (int)(context_start + context_length - 1)); + if (pad == 0) is_padding = false; + + CpuGpuFuncCompare test( + "ContextProjectionBackward", + FuncConfig() + .set("context_length", context_length) + .set("context_start", context_start) + .set("begin_pad", (size_t)std::max(0, -context_start)) + .set("is_padding", is_padding) + .set("total_pad", pad)); + + // prepare input arguments + test.addSequence(SequenceIdArg(TensorShape{batch_size})); + test.addInputs(SequenceArg( + VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length})); + test.addOutputs( + SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}), + ADD_TO); + if (is_padding) { // weight + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}), + ADD_TO); + } + + // run Function + test.run(); +} + +TEST(ContextProjection, Projection) { + for (auto context_start : {-5, -3, -1, 0, 3}) { + for (auto context_length : {1, 2, 5, 7}) { + for (auto trainable_padding : {false, true}) { + for (auto batch_size : {1, 2, 5, 20, 100}) { + for (auto input_dim : {15, 32, 63, 128, 200}) { + VLOG(3) << " context_start=" << context_start + << " context_length=" << context_length + << " trainable_padding=" << trainable_padding + << " batch_size=" << batch_size + << " input_dim=" << input_dim; + testMatrixProjectionForward(context_start, + context_length, + trainable_padding, + batch_size, + input_dim); + testMatrixProjectionBackward(context_start, + context_length, + trainable_padding, + batch_size, + input_dim); + } + } + } + } + } +} diff --git a/paddle/function/ConvOp.h b/paddle/legacy/function/ConvOp.h similarity index 100% rename from paddle/function/ConvOp.h rename to paddle/legacy/function/ConvOp.h diff --git a/paddle/function/ConvOpTest.h b/paddle/legacy/function/ConvOpTest.h similarity index 100% rename from paddle/function/ConvOpTest.h rename to paddle/legacy/function/ConvOpTest.h
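Editor's note: a compact reference (the helper `contextProject` is hypothetical, not part of the patch) for the forward semantics documented above: output column block j of row r holds row r + context_start + j of the input, and offsets that fall outside the sequence are zero-padded here (the learned-weight padding path is omitted). With context_start = -1 and context_length = 3 it reproduces the x/y example in the doc comment.

```cpp
#include <vector>

// y has dim * context_length columns; block j of row r copies input row
// r + context_start + j, or stays zero when that row is out of range.
std::vector<std::vector<float>> contextProject(
    const std::vector<std::vector<float>>& x, int context_start,
    int context_length) {
  int n = static_cast<int>(x.size());
  int dim = static_cast<int>(x[0].size());  // assumes a non-empty input
  std::vector<std::vector<float>> y(
      n, std::vector<float>(dim * context_length, 0.f));
  for (int r = 0; r < n; ++r) {
    for (int j = 0; j < context_length; ++j) {
      int src = r + context_start + j;
      if (src < 0 || src >= n) continue;  // zero padding
      for (int d = 0; d < dim; ++d) y[r][j * dim + d] = x[src][d];
    }
  }
  return y;
}
```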
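Editor's note: the `pad` expression used by both tests above counts the padding rows needed on each side of a sequence: `max(0, -context_start)` rows on the left and `max(0, context_start + context_length - 1)` on the right. A quick worked check (hypothetical snippet, not part of the patch):

```cpp
#include <algorithm>
#include <cassert>

int main() {
  // context_start = -1, context_length = 3: one left-pad row (offset -1)
  // and one right-pad row (offset +1), so pad == 2.
  int context_start = -1, context_length = 3;
  int pad = std::max(0, -context_start) +
            std::max(0, context_start + context_length - 1);
  assert(pad == 2);
  return 0;
}
```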
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CosSimOp.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" + +namespace paddle { +/** + * Cosine Similarity for CpuMatrix + * + * \param out_mat, output value, size: nSamples * 1. + * \param in1_mat, input value 1, size: nSamples * dim. + * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples). + * \param scale, default 1.0 + * + */ +template <> +void CosSimForward(CpuMatrix& out_mat, + const CpuMatrix& in1_mat, + const CpuMatrix& in2_mat, + real scale) { + CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData()); + size_t num_samples = out_mat.getHeight(); + size_t dim = in1_mat.getWidth(); + /// column vector [nSamples, 1] + real* out = out_mat.getData(); + const real* x = in1_mat.getData(); + const real* y = in2_mat.getData(); + + /// in2 might only have one row or full rows + CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples); + size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim; + for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) { + real square_sum_x = 0; + real square_sum_y = 0; + real xy = 0; + for (size_t j = 0; j < dim; ++j) { + square_sum_x += x[j] * x[j]; + square_sum_y += y[j] * y[j]; + xy += x[j] * y[j]; + } + CHECK(square_sum_x > 0 && square_sum_y > 0); + out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y)); + } +} + +/** + * Cosine Similarity + * for each row i, + * out[i] = scale * cos(input1[i], input2[i]) + * = scale * /sqrt(|input1[i]|^2 * |input2[i]|^2) + * when input2 only has one row, then for each row i, + * out[i] = cos(input1[i], input2[0]) + * + * \param inputs[0] input matrix 1, size: nSamples * dim. + * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples). + * \param outputs[0] output matrix, size : nSamples * 1. + */ + +template +class CosSimForwardFunc : public FunctionBase { + void init(const FuncConfig& config) override { + scale_ = config.get("scale"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(inputs.size(), 2UL); + CHECK_EQ(outputs.size(), 1UL); + + CHECK_EQ(inputs[0].shape().ndims(), 2UL); + CHECK_EQ(inputs[1].shape().ndims(), 2UL); + CHECK_EQ(outputs[0].shape().ndims(), 2UL); + + CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); + CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]); + CHECK_EQ(outputs[0].shape()[1], 1UL); + + CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data()); + + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + auto out_mat = outputs[0].matrix(); + const auto in1_mat = inputs[0].matrix(); + const auto in2_mat = inputs[1].matrix(); + + CosSimForward(out_mat, in1_mat, in2_mat, scale_); + } + + private: + real scale_; +}; + +/** + * Cosine Similarity Derivative for CpuMatrix + * + * \param in1_grad forward input grad 1, size: nSamples * dim. + * \param in2_grad forward input grad 2, + * size: n2 * dim (n2 == 1 or n2 == nSamples). 
+ * + * \param out_grad backward loss output grad, size : nSamples * 1. + * \param out_val forward output value, size: nSamples * 1. + * \param in1_val forward input value 1, size: nSamples * dim. + * \param in2_val forward input value 2, + * size: n2 * dim (n2 == 1 or n2 == nSamples). + * \param scale, default 1.0 + */ +template <> +void CosSimBackward(const CpuMatrix& out_grad, + const CpuMatrix& out_val, + const CpuMatrix& in1_val, + const CpuMatrix& in2_val, + CpuMatrix& in1_grad, + CpuMatrix& in2_grad, + real scale) { + CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() && + in2_val.getData() && in1_grad.getData() && in2_grad.getData()); + CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required"; + + const real* grad = out_grad.getData(); + const real* out = out_val.getData(); + const real* prev_out_x = in1_val.getData(); + const real* prev_out_y = in2_val.getData(); + real* prev_grad_x = in1_grad.getData(); + real* prev_grad_y = in2_grad.getData(); + + size_t num_samples = out_grad.getHeight(); + size_t dim = in1_val.getWidth(); + CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight()); + CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples); + size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim; + for (size_t i = 0; i < num_samples; ++i, + prev_out_x += dim, + prev_out_y += inc, + prev_grad_x += dim, + prev_grad_y += inc) { + real square_sum_x = 0; + real square_sum_y = 0; + real xy = 0; + for (size_t j = 0; j < dim; ++j) { + square_sum_x += prev_out_x[j] * prev_out_x[j]; + square_sum_y += prev_out_y[j] * prev_out_y[j]; + xy += prev_out_x[j] * prev_out_y[j]; + } + CHECK(square_sum_x > 0 && square_sum_y > 0); + if (xy == 0) { + real reciprocal = + 1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y)); + for (size_t j = 0; j < dim; ++j) { + prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal; + prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal; + } + } else { + real reciprocal_xy = 1.0f / xy; + real reciprocal_square_sum_x = 1.0f / square_sum_x; + real reciprocal_square_sum_y = 1.0f / square_sum_y; + for (size_t j = 0; j < dim; ++j) { + prev_grad_x[j] += + out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy - + prev_out_x[j] * reciprocal_square_sum_x); + prev_grad_y[j] += + out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy - + prev_out_y[j] * reciprocal_square_sum_y); + } + } + } +} + +/** + * Cosine Similarity backward Derivative + * + * \param outputs[0] forward input grad 1, size: nSamples * dim. + * \param outputs[1] forward input grad 2, + * size: n2 * dim (n2 == 1 or n2 == nSamples). + * + * \param inputs[0] backward loss output grad, size : nSamples * 1. + * \param inputs[1] forward output value, size: nSamples * 1. + * \param inputs[2] forward input value 1, size: nSamples * dim. + * \param inputs[3] forward input value 2, + * size: n2 * dim (n2 == 1 or n2 == nSamples). 
 */
+template <DeviceType Device>
+class CosSimBackwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(inputs.size(), 4UL);
+    CHECK_EQ(outputs.size(), 2UL);
+    /// dim of out_grad and out_val == 1, column vector
+    CHECK_EQ(inputs[0].shape()[1], 1UL);
+    CHECK_EQ(inputs[1].shape()[1], 1UL);
+    /// nSamples of out_grad == out_val == in1_val == in1_grad
+    CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]);
+    CHECK_EQ(inputs[2].shape()[0], inputs[0].shape()[0]);
+    CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]);
+    /// dim of in1_val == in2_val == in1_grad == in2_grad
+    CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]);
+    CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]);
+    CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]);
+
+    CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() &&
+          inputs[3].data() && outputs[0].data() && outputs[1].data());
+
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+
+    const auto out_grad = inputs[0].matrix<Device>();
+    const auto out_val = inputs[1].matrix<Device>();
+    const auto in1_val = inputs[2].matrix<Device>();
+    const auto in2_val = inputs[3].matrix<Device>();
+    auto in1_grad = outputs[0].matrix<Device>();
+    auto in2_grad = outputs[1].matrix<Device>();
+
+    CosSimBackward<Device>(
+        out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
+  }
+
+ private:
+  real scale_;
+};
+
+REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
+#endif
+}  // namespace paddle
diff --git a/paddle/function/CosSimOp.h b/paddle/legacy/function/CosSimOp.h
similarity index 100%
rename from paddle/function/CosSimOp.h
rename to paddle/legacy/function/CosSimOp.h
diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/legacy/function/CosSimOpGpu.cu
similarity index 100%
rename from paddle/function/CosSimOpGpu.cu
rename to paddle/legacy/function/CosSimOpGpu.cu
diff --git a/paddle/legacy/function/CosSimOpTest.cpp b/paddle/legacy/function/CosSimOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..31bb43e1baa9a6d890d1b8fe2abf15a07a7094c6
--- /dev/null
+++ b/paddle/legacy/function/CosSimOpTest.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
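Aside: the `xy != 0` branch of `CosSimBackward` above implements the analytic cosine-similarity gradient. A standalone finite-difference check of that formula (illustration only; the vectors and tolerances here are made up):

```cpp
// Verifies the analytic gradient from CosSimBackward (xy != 0 branch, with
// scale == 1 and grad[i] == 1) against a central finite difference.
#include <cmath>
#include <cstdio>
#include <vector>

static double cosSim(const std::vector<double>& x, const std::vector<double>& y) {
  double xy = 0, xx = 0, yy = 0;
  for (size_t j = 0; j < x.size(); ++j) {
    xy += x[j] * y[j];
    xx += x[j] * x[j];
    yy += y[j] * y[j];
  }
  return xy / (std::sqrt(xx) * std::sqrt(yy));
}

int main() {
  std::vector<double> x{0.3, -1.2, 0.7}, y{1.1, 0.4, -0.5};
  double out = cosSim(x, y);
  double xy = 0, xx = 0;
  for (size_t j = 0; j < x.size(); ++j) {
    xy += x[j] * y[j];
    xx += x[j] * x[j];
  }
  const double eps = 1e-6;
  for (size_t j = 0; j < x.size(); ++j) {
    // analytic form: out * (y[j]/xy - x[j]/|x|^2), as in prev_grad_x above
    double analytic = out * (y[j] / xy - x[j] / xx);
    std::vector<double> xp = x, xm = x;
    xp[j] += eps;
    xm[j] -= eps;
    double numeric = (cosSim(xp, y) - cosSim(xm, y)) / (2 * eps);
    std::printf("j=%zu analytic=%.8f numeric=%.8f\n", j, analytic, numeric);
  }
  return 0;
}
```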
*/ + +#include +#include "FunctionTest.h" +#include "paddle/legacy/math/Matrix.h" + +using namespace paddle; // NOLINT + +void testCosSimForward(size_t height_x, + size_t height_y, + size_t width, + real scale) { + CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale)); + // prepare input arguments + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width})); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width})); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}), + ASSIGN_TO); + // run Function + test.run(); +} + +void testCosSimBackward(size_t height_x, + size_t height_y, + size_t width, + real scale) { + CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale)); + // prepare input arguments + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width})); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width})); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}), + ADD_TO); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}), + ADD_TO); + // run Function + test.run(); +} + +TEST(Matrix, cosSim) { + for (auto height_x : {10, 100, 1000}) { + for (auto height_y : {1, height_x}) { + for (auto width : {10, 100, 1000}) { + for (auto scale : {1.0, 2.0}) { + testCosSimForward(height_x, height_y, width, scale); + testCosSimBackward(height_x, height_y, width, scale); + } + } + } + } +} diff --git a/paddle/legacy/function/CropOp.cpp b/paddle/legacy/function/CropOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e22678822f06a323d1e6c17dce63d44d143484a3 --- /dev/null +++ b/paddle/legacy/function/CropOp.cpp @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
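The tests above all follow the same harness pattern: declare shapes, let the harness generate data, run both kernels, compare. A hypothetical sketch for some new function — `FooForward` is an assumed name, and the snippet compiles only inside this test directory with CPU/GPU kernels registered as `FooForward-CPU`/`FooForward-GPU`:

```cpp
// Hypothetical harness usage; "FooForward" and its config key are examples.
#include "FunctionTest.h"

namespace paddle {

void testFooForward(size_t height, size_t width) {
  CpuGpuFuncCompare test("FooForward", FuncConfig().set("scale", (real)1.5));
  // arguments carry only type and shape; the harness fills in random data
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height, width}));
  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height, width}),
                  ASSIGN_TO);
  test.run();  // runs both registered kernels and compares their outputs
}

}  // namespace paddle
```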
 */
+
+#include "CropOp.h"
+#include "paddle/legacy/function/TensorShape.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void Crop<DEVICE_TYPE_CPU>(real* outputs,
+                           const real* inputs,
+                           const TensorShape inShape,
+                           const TensorShape outShape,
+                           const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+      conf.get<std::vector<uint32_t>>("crop_corner");
+  int cCrop = crop_corner[1];
+  int hCrop = crop_corner[2];
+  int wCrop = crop_corner[3];
+
+  int num = inShape[0];
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < outC; c++) {
+      for (int h = 0; h < outH; h++) {
+        int outoff = ((n * outC + c) * outH + h) * outW;
+        int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop;
+        memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real));
+      }
+    }
+  }
+}
+
+template <>
+void CropGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+                               real* outGrad,
+                               const TensorShape inShape,
+                               const TensorShape outShape,
+                               const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+      conf.get<std::vector<uint32_t>>("crop_corner");
+  int cCrop = crop_corner[1];
+  int hCrop = crop_corner[2];
+  int wCrop = crop_corner[3];
+
+  int num = outShape[0];
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < inC; c++) {
+      for (int h = 0; h < inH; h++) {
+        int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop;
+        int inoff = ((n * inC + c) * inH + h) * inW;
+        CpuVector inG = CpuVector(inW, const_cast<real*>(inGrad + inoff));
+        CpuVector outG = CpuVector(inW, outGrad + outoff);
+        outG += inG;
+      }
+    }
+  }
+}
+
+/**
+ * \brief Crop the input according to the specified corner and shape.
+ *        The input and output are 4D tensors. In CropFunc, we only
+ *        crop the 2nd to 4th dimensions.
+ *
+ * Argument in this Function:
+ * \param pad_     A struct object contains the cropping corner and shape.
+ * \param inputs   A 4D tensor, only one input.
+ * \param outputs  A 4D tensor, the output value after cropping.
+ *
+ * For example,
+ * Input(2,2,2,3) = [
+ *                    [ [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]] ],
+ *                    [ [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]] ]
+ *                  ]  # the input shape is (2,2,2,3)
+ *
+ * pad_: if corner = (0,1,1) and crop_shape = (2,1,2)
+ * Output(2,2,1,2) = [
+ *                     [ [[4,5]],
+ *                       [[6,7]] ],
+ *                     [ [[8,7]],
+ *                       [[3,5]] ]
+ *                   ]  # the output shape is (2,2,1,2)
+ */
+template <DeviceType Device>
+class CropFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    TensorShape inShape = inputs[0].shape();
+    TensorShape outShape = outputs[0].shape();
+
+    Crop<Device>(outputs[0].data<real>(),
+                 inputs[0].data<real>(),
+                 inShape,
+                 outShape,
+                 conf_);
+  }
+
+ private:
+  FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of cropping Function.
+ *
+ * Argument in this Function:
+ * \param crop_    The same meaning as it in CropFunc.
+ * \param inputs   The gradient with respect to the output value of CropFunc.
+ * \param outputs  The gradient with respect to the input value of CropFunc.
+ */ + +template +class CropGradFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { conf_ = config; } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + + TensorShape outShape = outputs[0].shape(); + TensorShape inShape = inputs[0].shape(); + + CropGrad(inputs[0].data(), + outputs[0].data(), + inShape, + outShape, + conf_); + } + + private: + FuncConfig conf_; +}; + +REGISTER_TYPED_FUNC(Crop, CPU, CropFunc); +REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(Crop, GPU, CropFunc); +REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/CropOp.h b/paddle/legacy/function/CropOp.h similarity index 100% rename from paddle/function/CropOp.h rename to paddle/legacy/function/CropOp.h diff --git a/paddle/function/CropOpGpu.cu b/paddle/legacy/function/CropOpGpu.cu similarity index 100% rename from paddle/function/CropOpGpu.cu rename to paddle/legacy/function/CropOpGpu.cu diff --git a/paddle/function/CropOpTest.cpp b/paddle/legacy/function/CropOpTest.cpp similarity index 100% rename from paddle/function/CropOpTest.cpp rename to paddle/legacy/function/CropOpTest.cpp diff --git a/paddle/legacy/function/CrossMapNormalOp.cpp b/paddle/legacy/function/CrossMapNormalOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f28703af00fa4bd7bebd98839cb077798083b61f --- /dev/null +++ b/paddle/legacy/function/CrossMapNormalOp.cpp @@ -0,0 +1,344 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
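Both `Crop` and `CropGrad` above rely on the NCHW flat-index identity `offset(n, c, h, w) == ((n * C + c) * H + h) * W + w`. A standalone illustration of that arithmetic (not part of the patch):

```cpp
// Crops a 2x2 window at corner (h=1, w=1) out of a 1x1x3x4 tensor using the
// same offset formula as Crop/CropGrad.
#include <cstdio>

int main() {
  const int C = 1, H = 3, W = 4;
  float in[C * H * W];
  for (int i = 0; i < C * H * W; ++i) in[i] = (float)i;

  const int hCrop = 1, wCrop = 1, outH = 2, outW = 2;
  for (int h = 0; h < outH; ++h) {
    for (int w = 0; w < outW; ++w) {
      // n == 0, c == 0 here, so only the spatial part of the offset remains
      int inoff = ((0 * C + 0) * H + (h + hCrop)) * W + (w + wCrop);
      std::printf("%g ", in[inoff]);  // prints: 5 6 / 9 10
    }
    std::printf("\n");
  }
  return 0;
}
```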
 */
+
+#include "CrossMapNormalOp.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void CrossMapNormal<DEVICE_TYPE_CPU>(real* outputs,
+                                     real* denoms,
+                                     const real* inputs,
+                                     size_t numSamples,
+                                     size_t channels,
+                                     size_t height,
+                                     size_t width,
+                                     size_t size,
+                                     real scale,
+                                     real pow) {
+  size_t oneImage = height * width;
+  size_t oneSample = channels * oneImage;
+
+  CpuVector outputsV(numSamples * oneSample, outputs);
+  CpuVector inputsV(numSamples * oneSample, const_cast<real*>(inputs));
+  CpuVector denomsV(numSamples * oneSample, denoms);
+
+  // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow)
+  // x represents inputs
+  // f(x) represents outputs
+  // denoms save the intermediate result for backward
+  denomsV = denomsV.constant(1.0);
+  const int start = -((int)size - 1) / 2;
+  const int end = (int)size + start;
+  for (size_t i = 0; i < numSamples; i++) {
+    real* oneDenom = denoms + i * oneSample;
+    real* oneInput = const_cast<real*>(inputs) + i * oneSample;
+    for (int c = 0; c < (int)channels; c++) {
+      CpuVector denom(oneImage, oneDenom + c * oneImage);
+      for (int s = start; s < end; s++) {
+        if (c + s >= 0 && c + s < (int)channels) {
+          CpuVector input(oneImage, oneInput + (c + s) * oneImage);
+          denom += input.square() * scale;
+        }
+      }
+    }
+  }
+
+  outputsV = inputsV * denomsV.pow(-pow);
+}
+
+template <>
+void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
+                                         const real* inputsValue,
+                                         const real* outputsValue,
+                                         const real* outputsGrad,
+                                         const real* denoms,
+                                         size_t numSamples,
+                                         size_t channels,
+                                         size_t height,
+                                         size_t width,
+                                         size_t size,
+                                         real scale,
+                                         real pow) {
+  size_t oneSample = channels * height * width;
+  std::function<CpuVector(real*, size_t)> oneImage = [=](real* data,
+                                                         size_t offset) {
+    return CpuVector(height * width, data + offset);
+  };
+
+  const int start = -((int)size) / 2;
+  const int end = (int)size + start;
+  const real ratio = -(real)2 * scale * pow;
+  for (size_t i = 0; i < numSamples; i++) {
+    size_t sOffset = i * oneSample;
+    real* oneInputGrad = inputsGrad + sOffset;
+    real* oneInputValue = const_cast<real*>(inputsValue) + sOffset;
+    real* oneDenom = const_cast<real*>(denoms) + sOffset;
+    real* oneOutputGrad = const_cast<real*>(outputsGrad) + sOffset;
+    real* oneOutputValue = const_cast<real*>(outputsValue) + sOffset;
+
+    for (int c = 0; c < (int)channels; c++) {
+      size_t cOffset = c * height * width;
+      CpuVector inputGrad = oneImage(oneInputGrad, cOffset);
+      CpuVector inputValue = oneImage(oneInputValue, cOffset);
+      CpuVector denom = oneImage(oneDenom, cOffset);
+      CpuVector outputGrad = oneImage(oneOutputGrad, cOffset);
+
+      inputGrad = inputGrad + denom.pow(-pow) * outputGrad;
+      for (int s = start; s < end; s++) {
+        if (c + s >= 0 && c + s < (int)channels) {
+          size_t offset = (c + s) * height * width;
+          CpuVector output = oneImage(oneOutputValue, offset);
+          CpuVector outputGrad = oneImage(oneOutputGrad, offset);
+          CpuVector denom = oneImage(oneDenom, offset);
+
+          inputGrad += ((outputGrad * output * ratio) / denom) * inputValue;
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief Normalization with across maps.
+ *
+ * This Function comes from the paper
+ * "ImageNet Classification with Deep Convolutional Neural Networks".
+ *
+ * The original formula is:
+ *
+ *                                Input(i, x, y)
+ * Output(i, x, y) = ----------------------------------------------
+ *                                -- upper
+ *                   (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
+ *                                -- j = lower
+ *
+ * upper is `min(C, c + N/2)`
+ * lower is `max(0, c - N/2)`
+ *
+ * Function implementation:
+ *
+ * The inputs and outputs are in NCHW format, while input.shape.ndims() equals 4.
+ * And the meaning of each dimension(0-3) is respectively batch size, + * feature maps, rows and columns. + * + * Input and Output in the above formula is for each map(i) of one image, and + * Input(i, x, y), Output(i, x, y) represents an element in an image. + * + * C is the number of feature maps of one image, and N is a hyper-parameters + * is configured when Function is initialized. The sum in the denominator + * is the sum of the same position in the neighboring maps. + * + * In the implementation of Function, k is equal to 1, + * so Function has no argument for k. + * + * Function Arguments: + * + * \param size_ represent N + * \param scale_ represent alpha + * \param pow_ represent beta + * \param inputs[0] represent Input + * \param outputs[0] represent Output + * \param outputs[1] represent The denominator in the formula(except beta) + * + * Note: + * Save output[1] is to simplify the backward calculation. + * TODO, if only consider the forward calculation, we can optimize to + * remove the output[1]. + */ +template +class CrossMapNormalFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + // function arguments + size_ = config.get("size"); + scale_ = config.get("scale"); + pow_ = config.get("pow"); + + // number of inputs and outputs + numInputs_ = 1; + numOutputs_ = 2; + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + check(inputs, outputs); + // ArgType check still on here, + // not sure whether it is better to put inside the check. + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO); + size_t batchSize = inputs[0].shape()[0]; + size_t maps = inputs[0].shape()[1]; + size_t rows = inputs[0].shape()[2]; + size_t columns = inputs[0].shape()[3]; + + CrossMapNormal(outputs[0].data(), + outputs[1].data(), + inputs[0].data(), + batchSize, + maps, + rows, + columns, + size_, + scale_, + pow_); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + + CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); + CHECK(inputs[0].shape() == outputs[0].shape()); + CHECK(inputs[0].shape() == outputs[1].shape()); + } + + // Only need the shape of the input, can calculate the + // floating-point operation. + size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ((size_t)numInputs_, inputs.size()); + size_t batchSize = inputs[0].shape()[0]; + size_t maps = inputs[0].shape()[1]; + size_t rows = inputs[0].shape()[2]; + size_t columns = inputs[0].shape()[3]; + + // number of floating-point operations + // an approximate value + size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3); + + return ops; + } + + private: + size_t size_; + real scale_; + real pow_; +}; + +/** + * \brief Backward calculation for normalization with across maps. + * + * Function implementation: + * + * The implementation of this Function is derived from the + * CrossMapNormalFunc implementation. + * + * InputGrad = OutputGrad * denoms ^ (-beta) + * -- upper + * + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue + * -- lower + * + * The data of inputs/outputs format is the same as the forward interface + * and is NCHW. + * + * The upper and lower is the same as forward. The logic of the sum + * is also the same as forward. 
+ * + * Function Arguments: + * + * \param size_ represent N + * \param scale_ represent alpha + * \param pow_ represent beta + * \param inputs[0] represent InputValue, inputs[0] of CrossMapNormalFunc + * \param inputs[1] represent OutputValue, outputs[0] of CrossMapNormalFunc + * \param inputs[2] represent OutputGrad + * \param inputs[3] represent denoms, outputs[1] of CrossMapNormalFunc + * This is the intermediate result that is + * preserved in the forward calculation. + * \param outputs[0] represent InputGrad + */ +template +class CrossMapNormalGradFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + // function arguments + size_ = config.get("size"); + scale_ = config.get("scale"); + pow_ = config.get("pow"); + + // number of inputs and outputs + numInputs_ = 4; + numOutputs_ = 1; + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + check(inputs, outputs); + if (outputs[0].getArgType() != ADD_TO) { + // Currently, some algorithm implementations are ASSIGN_TO mode, + // if need to support the ADD_TO calculation, need to clear the output. + typename Tensor::Vector tmp( + outputs[0].shape().getElements(), outputs[0].data()); + tmp.zero(); + } + + size_t batchSize = inputs[0].shape()[0]; + size_t maps = inputs[0].shape()[1]; + size_t rows = inputs[0].shape()[2]; + size_t columns = inputs[0].shape()[3]; + + CrossMapNormalGrad(outputs[0].data(), + inputs[0].data(), + inputs[1].data(), + inputs[2].data(), + inputs[3].data(), + batchSize, + maps, + rows, + columns, + size_, + scale_, + pow_); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + + CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); + CHECK(inputs[0].shape() == inputs[1].shape()); + CHECK(inputs[0].shape() == inputs[2].shape()); + CHECK(inputs[0].shape() == inputs[3].shape()); + CHECK(inputs[0].shape() == outputs[0].shape()); + } + + // Only need the shape of one input, can calculate the + // floating-point operation. 
+ size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_LT((size_t)1, inputs.size()); + size_t batchSize = inputs[0].shape()[0]; + size_t maps = inputs[0].shape()[1]; + size_t rows = inputs[0].shape()[2]; + size_t columns = inputs[0].shape()[3]; + + // number of floating-point operations + // an approximate value + size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2); + + return ops; + } + + private: + size_t size_; + real scale_; + real pow_; +}; + +REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc); +REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc); +REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/CrossMapNormalOp.h b/paddle/legacy/function/CrossMapNormalOp.h similarity index 100% rename from paddle/function/CrossMapNormalOp.h rename to paddle/legacy/function/CrossMapNormalOp.h diff --git a/paddle/function/CrossMapNormalOpGpu.cu b/paddle/legacy/function/CrossMapNormalOpGpu.cu similarity index 100% rename from paddle/function/CrossMapNormalOpGpu.cu rename to paddle/legacy/function/CrossMapNormalOpGpu.cu diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/legacy/function/CrossMapNormalOpTest.cpp similarity index 100% rename from paddle/function/CrossMapNormalOpTest.cpp rename to paddle/legacy/function/CrossMapNormalOpTest.cpp diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/legacy/function/DepthwiseConvOp.cpp similarity index 100% rename from paddle/function/DepthwiseConvOp.cpp rename to paddle/legacy/function/DepthwiseConvOp.cpp diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/legacy/function/DepthwiseConvOp.h similarity index 100% rename from paddle/function/DepthwiseConvOp.h rename to paddle/legacy/function/DepthwiseConvOp.h diff --git a/paddle/legacy/function/DepthwiseConvOpGpu.cu b/paddle/legacy/function/DepthwiseConvOpGpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..17138cc56390d0fcfb15d4b77a56eda466bcfd3c --- /dev/null +++ b/paddle/legacy/function/DepthwiseConvOpGpu.cu @@ -0,0 +1,376 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
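Aside: the CUDA kernels that follow assign one thread per element and peel the linear thread index back into NCHW coordinates, innermost dimension last. A host-side standalone sketch of that decomposition (illustration only):

```cpp
// Decomposes a linear index into (batch, c_out, h_out, w_out), exactly as the
// depthwise forward kernel does on-device.
#include <cstdio>

int main() {
  const int outputChannels = 3, outputHeight = 4, outputWidth = 5;
  int index = 42;  // a sample thread index
  int batch = index / (outputChannels * outputHeight * outputWidth);
  int c_out = (index / (outputHeight * outputWidth)) % outputChannels;
  int h_out = (index / outputWidth) % outputHeight;
  int w_out = index % outputWidth;
  // for index 42 this prints: n=0 c=2 h=0 w=2
  std::printf("n=%d c=%d h=%d w=%d\n", batch, c_out, h_out, w_out);
  return 0;
}
```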
*/ + +#include "DepthwiseConvOp.h" +#include "paddle/legacy/math/BaseMatrix.h" + +namespace paddle { + +// CUDA kernel to compute the depthwise convolution forward pass +template +__global__ void ConvolutionDepthwiseForward(const int nthreads, + const T* const inputData, + const T* const filterData, + const int batchSize, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const outputData) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if (index < nthreads) { + const int batch = index / outputChannels / outputHeight / outputWidth; + const int c_out = (index / outputHeight / outputWidth) % outputChannels; + const int h_out = (index / outputWidth) % outputHeight; + const int w_out = index % outputWidth; + + const int c_in = c_out / filterMultiplier; + const T* weight = filterData + c_out * filterHeight * filterWidth; + T value = 0; + const int h_in_start = -paddingH + h_out * strideH; + const int w_in_start = -paddingW + w_out * strideW; + const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; + const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; + if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) && + (w_in_end < inputWidth)) { + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + ++weight; + } + } + } else { + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + } + ++weight; + } + } + } + outputData[index] = value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t input. +template +__global__ void ConvolutionDepthwiseInputBackward(const int nthreads, + const T* const top_diff, + const T* const weight_data, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const bottom_diff) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int batch = index / inputChannels / inputHeight / inputWidth; + const int c_in = (index / inputHeight / inputWidth) % inputChannels; + const int h_in = (index / inputWidth) % inputHeight; + const int w_in = index % inputWidth; + + const int c_out_start = c_in * filterMultiplier; + + int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH; + h_out_start = 0 > h_out_start ? 
0 : h_out_start; + int h_out_end = (h_in + paddingH) / strideH; + h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end; + int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + int w_out_end = (w_in + paddingW) / strideW; + w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + paddingH - h_out * strideH; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + paddingW - w_out * strideW; + const int filter_offset = c_out * filterHeight * filterWidth + + filter_h * filterWidth + filter_w; + const int top_diff_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + value += top_diff[top_diff_offset] * weight_data[filter_offset]; + } + } + } + bottom_diff[index] += value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t filter. +template +__global__ void ConvolutionDepthwiseFilterBackward(const int num_i, + const int nthreads, + const T* const top_diff, + const T* const inputData, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const buffer_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int h_out = (index / outputWidth) % outputHeight; + const int w_out = index % outputWidth; + const int kh = + (index / filterWidth / outputHeight / outputWidth) % filterHeight; + const int kw = (index / outputHeight / outputWidth) % filterWidth; + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int c_out = + index / (filterHeight * filterWidth * outputHeight * outputWidth); + const int c_in = c_out / filterMultiplier; + const int batch = num_i; + const int top_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + const int bottom_offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + + w_in; + buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; + } else { + buffer_data[index] = 0; + } + } +} + +template +class DepthwiseConvFunctor { + public: + void operator()(const T* inputData, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData) { + int outputSize = batchSize * outputChannels * outputHeight * outputWidth; + + size_t blocks = (outputSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + ConvolutionDepthwiseForward<<>>( + outputSize, + inputData, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + 
filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + outputData); + } +}; + +template +class DepthwiseConvGradInputFunctor { + public: + void operator()(const T* outputGrad, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad) { + int inputSize = batchSize * inputChannels * inputHeight * inputWidth; + + size_t blocks = (inputSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + ConvolutionDepthwiseInputBackward + // NOLINT_NEXT_LINE(whitespace/operators) + <<>>(inputSize, + outputGrad, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + inputGrad); + } +}; + +template +class DepthwiseConvGradFilterFunctor { + public: + void operator()(const T* outputGrad, + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* filterGrad) { + int colDataSize = outputChannels * filterHeight * filterWidth * + outputHeight * outputWidth; + + size_t blocks = (colDataSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, + 1, + filterGrad, + false, + true); + + for (int i = 0; i < batchSize; i++) { + ConvolutionDepthwiseFilterBackward< + T><<>>(i, + colDataSize, + outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + colData); + int K = outputHeight * outputWidth; + int M = colDataSize / K; + + BaseMatrix colMatrix(M, K, colData, false, true); + filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); + } + } +}; + +#ifdef PADDLE_TYPE_DOUBLE +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvGradFilterFunctor; +#else +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvGradFilterFunctor; +#endif + +} // namespace paddle diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/legacy/function/DepthwiseConvOpTest.cpp similarity index 100% rename from paddle/function/DepthwiseConvOpTest.cpp rename to paddle/legacy/function/DepthwiseConvOpTest.cpp diff --git a/paddle/legacy/function/EigenGemm.cpp b/paddle/legacy/function/EigenGemm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5929c5c68ec818c2307580b06f76c63f04e0db5f --- /dev/null +++ b/paddle/legacy/function/EigenGemm.cpp @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
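A note on the launch shapes used by the functors above: work is covered by 1024-thread blocks, and the block count is folded into a two-dimensional grid, which is why the kernels compute their index as `(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x`. The arithmetic, standalone (illustration only):

```cpp
// Ceil-divide the job count into 1024-thread blocks, then fold the blocks
// into a (512, blockY) grid so neither grid dimension grows unbounded.
#include <cstdio>

int main() {
  int outputSize = 3000000;                        // elements to cover
  size_t blocks = (outputSize + 1024 - 1) / 1024;  // ceil(outputSize / 1024)
  size_t blockX = 512;
  size_t blockY = (blocks + 512 - 1) / 512;        // ceil(blocks / 512)
  std::printf("blocks=%zu grid=(%zu,%zu) covers %zu threads\n",
              blocks, blockX, blockY, blockX * blockY * 1024);
  return 0;
}
```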
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include "paddle/legacy/function/EigenThreadDevice.h"
+
+namespace paddle {
+
+template <class T>
+struct EigenBlasGemm {
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
+                           Eigen::Aligned>
+      EigenMatrix;
+
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    Eigen::array<int, 2> sizeA;
+    if (transA) {
+      sizeA[0] = K;
+      sizeA[1] = M;
+      CHECK_EQ(M, lda);
+    } else {
+      sizeA[0] = M;
+      sizeA[1] = K;
+      CHECK_EQ(K, lda);
+    }
+    Eigen::array<int, 2> sizeB;
+    if (transB) {
+      sizeB[0] = N;
+      sizeB[1] = K;
+      CHECK_EQ(K, ldb);
+    } else {
+      sizeB[0] = K;
+      sizeB[1] = N;
+      CHECK_EQ(N, ldb);
+    }
+    Eigen::array<int, 2> sizeC = {{M, ldc}};
+    Eigen::array<int, 2> offsetC = {{0, 0}};
+    Eigen::array<int, 2> extentC = {{M, N}};
+
+    const EigenMatrix a(const_cast<T*>(A), sizeA);
+    const EigenMatrix b(const_cast<T*>(B), sizeB);
+    EigenMatrix c(C, sizeC);
+
+    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    dims[0].first = transA ? 0 : 1;
+    dims[0].second = transB ? 1 : 0;
+
+    auto* device = EigenDeviceWarpper::device();
+    if (N == ldc) {
+      if (alpha == T(1) && beta == T(0)) {
+        c.device(*device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.device(*device) += a.contract(b, dims);
+      } else {
+        c.device(*device) = alpha * a.contract(b, dims) + beta * c;
+      }
+    } else {
+      if (alpha == T(1) && beta == T(0)) {
+        c.slice(offsetC, extentC).device(*device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.slice(offsetC, extentC).device(*device) += a.contract(b, dims);
+      } else {
+        c.slice(offsetC, extentC).device(*device) =
+            alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
+      }
+    }
+    EigenDeviceWarpper::free_device(device);
+  }
+};
+
+#ifdef PADDLE_TYPE_DOUBLE
+template struct EigenBlasGemm<double>;
+#else
+template struct EigenBlasGemm<float>;
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/EigenThreadDevice.h b/paddle/legacy/function/EigenThreadDevice.h
similarity index 100%
rename from paddle/function/EigenThreadDevice.h
rename to paddle/legacy/function/EigenThreadDevice.h
diff --git a/paddle/function/Function.cpp b/paddle/legacy/function/Function.cpp
similarity index 100%
rename from paddle/function/Function.cpp
rename to paddle/legacy/function/Function.cpp
diff --git a/paddle/legacy/function/Function.h b/paddle/legacy/function/Function.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc6f999a0e08621c1ffbebb51bcfab1e5f9a5630
--- /dev/null
+++ b/paddle/legacy/function/Function.h
@@ -0,0 +1,214 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
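Aside: the contraction pair is the core of `EigenBlasGemm`. `DimPair(1, 0)` contracts the columns of `a` with the rows of `b`, i.e. a plain matrix product; transposition only changes which axes are paired. A minimal standalone sketch, assuming Eigen's unsupported Tensor module is on the include path:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3), b(3, 2);
  a.setConstant(1.0f);
  b.setConstant(2.0f);
  // pair (1, 0): contract dimension 1 of a with dimension 0 of b, i.e. a * b
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
  Eigen::Tensor<float, 2, Eigen::RowMajor> c = a.contract(b, dims);
  std::cout << c << "\n";  // every entry is 6 (= sum over K=3 of 1 * 2)
  return 0;
}
```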
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "BufferArg.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/Any.h" +#include "paddle/utils/ClassRegistrar.h" +#include "paddle/utils/Error.h" + +namespace paddle { + +/** + * Function Configuration. + * The argument type of Function::init. + */ +class FuncConfig { + public: + template + T get(const std::string& key, Error* err = nullptr) const { + try { + return any_cast(valueMap_.at(key)); + } catch (std::exception& e) { // could be cast or out of range exception. + if (err) { + *err = Error(e.what()); + } else { + LOG(FATAL) << "Cannot get key " << key << " with error " << e.what(); + } + return T(); + } + } + + template + FuncConfig& set(const std::string& key, T v, Error* err = nullptr) { + auto it = valueMap_.find(key); + if (it != valueMap_.end()) { // already contains key. + if (err) { + *err = Error("Key %s is already set in FuncConfig", key.c_str()); + } else { + LOG(FATAL) << "Key " << key << " is already set in FuncConfig."; + } + return *this; + } + valueMap_[key] = any(v); + return *this; + } + + protected: + mutable std::unordered_map valueMap_; +}; + +/** + * Argument type for Function::calc(). + * A BufferArgs contains a set of BufferArg, + * because Function can have multiple inputs and outputs. + * + * addArg() with Matix object used to adapt Layer Argument. + * Will create a BufferArg object in addArg(), + * and free in destructor of BufferArgs. + * + * addArg() with BufferArg object, just save BufferArg object address, + * and the caller needs to guarantee the validity of the BufferArg object + * in the BufferArgs life time. + */ +class BufferArgs { + public: + BufferArgs() {} + + ~BufferArgs() { + for (auto arg : _args_) { + delete arg; + } + } + + size_t size() const { return args_.size(); } + + // add argument into BufferArgs + // Tensor can be Matrix, Vector, IVector. + // For inputs, do not need argType. + // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO. + void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) { + _args_.push_back(new BufferArg(arg, argType)); + addArg(*_args_.back()); + } + + void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) { + _args_.push_back(new BufferArg(arg, argType)); + addArg(*_args_.back()); + } + + void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) { + _args_.push_back(new BufferArg(arg, argType)); + addArg(*_args_.back()); + } + + // Add arg into BufferArgs and reshape the arg. + // + // For example, arg represents an image buffer, + // but Matrix can only represent a two-dimensional Tensor. + // So need an extra argument to describe the shape of the image buffer. 
+ void addArg(const Matrix& arg, + const TensorShape& shape, + ArgType argType = UNSPECIFIED); + + void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED); + void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED); + + void addArg(const Matrix& matrix, + const IVector& vector, + ArgType argType = UNSPECIFIED); + + // get argument + const BufferArg& operator[](size_t num) const { + CHECK_LT(num, args_.size()); + return *args_[num]; + } + + void addArg(BufferArg& arg) { args_.push_back(&arg); } + + void addArg(SequenceIdArg& arg) { args_.push_back(&arg); } + + void addArg(SequenceArg& arg) { args_.push_back(&arg); } + + void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); } + + private: + std::vector args_; + // The BufferArg object is constructed and freed by BufferArgs. + std::vector _args_; +}; + +/** + * \brief Base class for Function. + * The basic Function implementation requires override init and calc interfaces. + * + * The caller needs to ensure the validity of the arguments + * during Function execution. + * + * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO + * and ADD_TO. + * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation + * result of Function assigned to the output BufferArg. + * If output.getArgType() == ADD_TO, this is add mode, and the calculation + * result of Function need added to the output BufferArg. + * + * For example: + * ASSIGN_TO: output = Function(inputs) + * ADD_TO: output += Function(inputs) + * If Function has more than one output, each output can have different modes. + */ +class FunctionBase { + public: + virtual ~FunctionBase() {} + + virtual void init(const FuncConfig& config) {} + + virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {} + + // This member function is used to check whether the BufferType and shape of + // the inputs and outputs arguments of the Function are correct. + // General calc function which will call this check to do arguments check. + // And before the calc called, the caller can also check their own arguments. + virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {} + + // Calculate the number of floating-point operations of this Function. + // The inputs and outputs arguments do not need to contain the actual data, + // only the shape. + // And some Functions have the same input and output shapes, + // so you may not need to enter the complete number of arguments. + // But entering the full arguments is always correct for this interface. + virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) { + return 0; + } + + int getNumInputs() const { return numInputs_; } + + int getNumOutputs() const { return numOutputs_; } + + static ClassRegistrar funcRegistrar_; + + protected: + // numInputs_ and numOutputs_ represents the maximum + // input and output supported by Function. 
+ // Some functions are optimized for input and output, + // so when comparing the number of arguments, for these functions + // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_ + size_t numInputs_; + size_t numOutputs_; +}; + +#define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName + +#define REGISTER_TYPED_FUNC(typeName, deviceName, className) \ + static InitFunction __reg_type_##typeName##deviceName([]() { \ + FunctionBase::funcRegistrar_ \ + .registerClass>( \ + FUNC_NAME(typeName, deviceName)); \ + }) + +} // namespace paddle diff --git a/paddle/legacy/function/FunctionTest.cpp b/paddle/legacy/function/FunctionTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1a0993e3135bcad9eb8a431e079ed56a267174ea --- /dev/null +++ b/paddle/legacy/function/FunctionTest.cpp @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" +#include +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { + +template +void FunctionApi(typename Tensor::Matrix& output, + const typename Tensor::Matrix& input); + +template <> +void FunctionApi(CpuMatrix& output, const CpuMatrix& input) { + EXPECT_EQ(output.getHeight(), 100U); + EXPECT_EQ(output.getWidth(), 200U); +} + +template <> +void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { + EXPECT_EQ(output.getHeight(), 10U); + EXPECT_EQ(output.getWidth(), 20U); +} + +template +void Function(const BufferArgs& arguments) { + const auto input = arguments[0].matrix(); + auto output = arguments[1].matrix(); + FunctionApi(output, input); +} + +TEST(Function, BufferArgs) { + CpuMatrix cpuInput = CpuMatrix(100, 200); + CpuMatrix cpuOutput = CpuMatrix(100, 200); + BufferArgs cpuArgments; + cpuArgments.addArg(cpuInput); + cpuArgments.addArg(cpuOutput); + Function(cpuArgments); + + GpuMatrix gpuInput = GpuMatrix(10, 20); + GpuMatrix gpuOutput = GpuMatrix(10, 20); + BufferArgs gpuArgments; + gpuArgments.addArg(gpuInput); + gpuArgments.addArg(gpuOutput); + Function(gpuArgments); +} + +/** + * Some tests case are used to check the consistency between the BufferArg type + * argument received by Function and the original type argument. 
+ * + * Use Case: + * TEST() { + * Matrix matrix(...); + * CheckBufferArg lambda = [=](const BufferArg& arg) { + * // check matrix and arg are equivalent + * EXPECT_EQ(matrix, arg); + * } + * + * BufferArgs argments{matrix...}; + * std::vector checkFunc{lambda...}; + * testBufferArgs(argments, checkFunc); + * } + */ +typedef std::function CheckBufferArg; + +void testBufferArgs(const BufferArgs& inputs, + const std::vector& check) { + EXPECT_EQ(inputs.size(), check.size()); + for (size_t i = 0; i < inputs.size(); i++) { + check[i](inputs[i]); + } +} + +void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) { + EXPECT_EQ(inputs.size(), 1U); + check(inputs[0]); +} + +TEST(Arguments, Matrix) { + MatrixPtr matrix = Matrix::create(100, 200); + CheckBufferArg check = [=](const BufferArg& arg) { + EXPECT_EQ(arg.shape().ndims(), 2U); + EXPECT_EQ(arg.shape()[0], 100U); + EXPECT_EQ(arg.shape()[1], 200U); + EXPECT_EQ(arg.data(), matrix->getData()); + + EXPECT_EQ(arg.matrix().getHeight(), matrix->getHeight()); + EXPECT_EQ(arg.matrix().getWidth(), matrix->getWidth()); + EXPECT_EQ(arg.matrix().getData(), matrix->getData()); + }; + + BufferArgs argments; + argments.addArg(*matrix); + std::vector checkFunc; + checkFunc.push_back(check); + testBufferArgs(argments, checkFunc); +} + +TEST(Arguments, Vector) { + VectorPtr vector = Vector::create(100, false); + CheckBufferArg check = [=](const BufferArg& arg) { + EXPECT_EQ(arg.shape().ndims(), 1U); + EXPECT_EQ(arg.shape()[0], 100U); + EXPECT_EQ(arg.data(), vector->getData()); + + CpuVector inVector = arg.vector(); + EXPECT_EQ(inVector.getSize(), vector->getSize()); + EXPECT_EQ(inVector.getData(), vector->getData()); + }; + + BufferArgs argments; + argments.addArg(*vector); + std::vector checkFunc; + checkFunc.push_back(check); + testBufferArgs(argments, checkFunc); +} + +TEST(Arguments, CpuSparseMatrix) { + CpuSparseMatrix sparse(200, 300, 50); + CheckBufferArg check = [=](const BufferArg& arg) { + EXPECT_EQ(arg.shape().ndims(), 2U); + EXPECT_EQ(arg.shape()[0], 200U); + EXPECT_EQ(arg.shape()[1], 300U); + EXPECT_EQ(arg.data(), sparse.getData()); + // CHECK_EQ(arg.sparse().nnz(), 50); + // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT); + // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE); + EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows()); + EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols()); + }; + + BufferArgs argments; + argments.addArg(sparse); + std::vector checkFunc; + checkFunc.push_back(check); + testBufferArgs(argments, checkFunc); +} + +TEST(Arguments, BufferArg) { + BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3}); + CheckBufferArg check = [=](const BufferArg& arg) { + EXPECT_EQ(arg.shape().ndims(), 3U); + EXPECT_EQ(arg.shape()[0], 1U); + EXPECT_EQ(arg.shape()[1], 2U); + EXPECT_EQ(arg.shape()[2], 3U); + }; + + BufferArgs argments; + argments.addArg(arg); + testBufferArgs(argments, check); +} + +} // namespace paddle diff --git a/paddle/legacy/function/FunctionTest.h b/paddle/legacy/function/FunctionTest.h new file mode 100644 index 0000000000000000000000000000000000000000..6f01981a34bff0a7d9bb04d0a0012117ecf5f803 --- /dev/null +++ b/paddle/legacy/function/FunctionTest.h @@ -0,0 +1,410 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
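For completeness, a sketch of the `FuncConfig` round trip that the Functions above depend on; the keys and values are examples, and the snippet assumes it is compiled inside `paddle/legacy/function`:

```cpp
// Sketch only; "scale" and "size" are example keys, not fixed API.
#include "Function.h"

namespace paddle {

void funcConfigExample() {
  FuncConfig config;
  // set() returns the config itself, so calls chain
  config.set("scale", (real)2.0).set("size", (size_t)5);
  real scale = config.get<real>("scale");  // == 2.0
  // setting an existing key is an error: without an Error* it LOG(FATAL)s
  Error err;
  config.set("scale", (real)3.0, &err);  // err now carries the message
  (void)scale;
}

}  // namespace paddle
```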
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/math/tests/TensorCheck.h" +#include "paddle/testing/TestUtil.h" + +namespace paddle { + +typedef std::shared_ptr BufferArgPtr; + +namespace test { +template +struct Allocator; + +template <> +struct Allocator { + using type = CpuMemoryHandle; +}; + +template <> +struct Allocator { + using type = GpuMemoryHandle; +}; + +// Copy argument1 to argument2 +template +class CopyArgument { + public: + void operator()(const BufferArg& arg1, BufferArg& arg2) { + CHECK_EQ(arg1.valueType(), arg2.valueType()); + CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements()); + + if (arg1.valueType() == VALUE_TYPE_INT32) { + IVectorPtr vector1 = + IVector::create((int*)arg1.data(), + arg1.shape().getElements(), + DType1 == DEVICE_TYPE_CPU ? false : true); + IVectorPtr vector2 = + IVector::create((int*)arg2.data(), + arg2.shape().getElements(), + DType2 == DEVICE_TYPE_CPU ? false : true); + vector2->copyFrom(*vector1); + } else { + VectorPtr vector1 = + Vector::create((real*)arg1.data(), + arg1.shape().getElements(), + DType1 == DEVICE_TYPE_CPU ? false : true); + VectorPtr vector2 = + Vector::create((real*)arg2.data(), + arg2.shape().getElements(), + DType2 == DEVICE_TYPE_CPU ? false : true); + vector2->copyFrom(*vector1); + } + } +}; +} // namespace test + +/** + * \brief A class for comparing two Functions of different implementations. + * For example, can be used to compare the CPU and GPU implementation + * of the function is consistent. + * + * Use case: + * // Initializes a test object, the corresponding cpu and gpu Function + * // are constructed according to FunctionName and FuncConfig. + * CpuGpuFuncCompare test(FunctionName, FuncConfig); + * // Prepare inputs and outputs arguments. + * // Here the input and output can not contain real data, + * // only contains the argument type and shape. + * test.addInputs(input1); + * test.addInputs(input2); + * test.addOutputs(output1); + * test.addOutputs(output2); + * // Run. + * // Will according to the type and shape of arguments(inputs_/outputs_), + * // automatic initialization cpu and gpu function required arguments + * // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_). + * // Call the CPU and GPU Function calculation results. + * // Compares CPU and GPU calculation results for consistency. 
+ * test.run(); + */ +template +class Compare2Function { + public: + typedef typename test::Allocator::type Allocator1; + typedef typename test::Allocator::type Allocator2; + typedef typename Tensor::Vector Vector1; + typedef typename Tensor::Vector Vector2; + typedef typename Tensor::SparseMatrix SparseMatrix1; + typedef typename Tensor::SparseMatrix SparseMatrix2; + + Compare2Function(const std::string& name1, + const std::string& name2, + const FuncConfig& config) + : function1_(FunctionBase::funcRegistrar_.createByType(name1)), + function2_(FunctionBase::funcRegistrar_.createByType(name2)) { + function1_->init(config); + function2_->init(config); + initArgsCallback_ = nullptr; + } + + ~Compare2Function() {} + + // input need only contains shape, do not contains data. + void addInputs(const BufferArg& input) { + size_t size = + input.shape().getElements() * sizeOfValuType(input.valueType()); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); + + func1Inputs_.emplace_back(std::make_shared( + func1Memory_.back()->getBuf(), input.valueType(), input.shape())); + func2Inputs_.emplace_back(std::make_shared( + func2Memory_.back()->getBuf(), input.valueType(), input.shape())); + } + + // assume one copy of sequence is shared by different SequenceArgs + void addSequence(const SequenceIdArg& input) { + CHECK_EQ(input.shape().ndims(), 1UL); + size_t batchSize = input.shape()[0]; + size_t numSeqs = batchSize / 10 + 1; + size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32); + func1Memory_.emplace_back(std::make_shared(sizeId)); + func2Memory_.emplace_back(std::make_shared(sizeId)); + seq1_ = std::make_shared(func1Memory_.back()->getBuf(), + TensorShape{numSeqs + 1}); + seq2_ = std::make_shared(func2Memory_.back()->getBuf(), + TensorShape{numSeqs + 1}); + /// init sequence Id + initArg(*seq1_, batchSize); + + copyArg_(*seq1_, *seq2_); + } + + void addInputs(const SequenceArg& input) { + CHECK_EQ(input.shape().ndims(), 2UL); + size_t batchSize = input.shape()[0]; + if (!seq1_ || !seq2_) { // sequence not exist + addSequence(SequenceIdArg(TensorShape{batchSize})); + } + + size_t size = + input.shape().getElements() * sizeOfValuType(input.valueType()); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); + + /// SequenceArg + func1Inputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), + input.valueType(), + input.shape(), + *seq1_)); + func2Inputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), + input.valueType(), + input.shape(), + *seq2_)); + } + + void registerInitCallback(std::function callback) { + initArgsCallback_ = callback; + } + + // output need only contains shape, do not contains data. 
+ void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) { + size_t size = + output.shape().getElements() * sizeOfValuType(output.valueType()); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); + + func1Outputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), + output.valueType(), + output.shape(), + argType)); + func2Outputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), + output.valueType(), + output.shape(), + argType)); + } + + /// add and init output sparse matrix + void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) { + sparse1_ = std::make_shared( + output.shape()[0], + output.shape()[1], + output.nnz(), + static_cast(output.dataType()), + static_cast(output.dataFormat())); + + sparse2_ = std::make_shared( + output.shape()[0], + output.shape()[1], + output.nnz(), + static_cast(output.dataType()), + static_cast(output.dataFormat())); + + /// init sparse matrix + hl_stream_t stream(HPPL_STREAM_1); + sparse1_->randomizeUniform(); + sparse2_->copyFrom(*sparse1_, stream); + hl_stream_synchronize(stream); + + func1Outputs_.emplace_back( + std::make_shared(*sparse1_, argType)); + func2Outputs_.emplace_back( + std::make_shared(*sparse2_, argType)); + } + + void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) { + CHECK_EQ(output.shape().ndims(), 2UL); + size_t batchSize = output.shape()[0]; + + if (!seq1_ || !seq2_) { // sequence not exist + addSequence(SequenceIdArg(TensorShape{batchSize})); + } + size_t size = + output.shape().getElements() * sizeOfValuType(output.valueType()); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); + + /// SequenceArg + func1Outputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), + output.valueType(), + output.shape(), + *seq1_, + argType)); + func2Outputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), + output.valueType(), + output.shape(), + *seq2_, + argType)); + } + + void addInputs(const SparseMatrixArg& input) { + sparse1_ = std::make_shared( + input.shape()[0], + input.shape()[1], + input.nnz(), + static_cast(input.dataType()), + static_cast(input.dataFormat())); + + sparse2_ = std::make_shared( + input.shape()[0], + input.shape()[1], + input.nnz(), + static_cast(input.dataType()), + static_cast(input.dataFormat())); + + /// init sparse matrix + hl_stream_t stream(HPPL_STREAM_1); + sparse1_->randomizeUniform(); + sparse2_->copyFrom(*sparse1_, stream); + hl_stream_synchronize(stream); + + func1Inputs_.emplace_back(std::make_shared(*sparse1_)); + func2Inputs_.emplace_back(std::make_shared(*sparse2_)); + } + + void run() { + // prepare cpu/gpu arguments + initInputs(); + + initOutputs(); + // function calculate + auto callFunction = [](FunctionBase* function, + std::vector& inputs, + std::vector& outputs) { + BufferArgs inArgs; + BufferArgs outArgs; + for (auto arg : inputs) { + inArgs.addArg(*arg); + } + for (auto arg : outputs) { + outArgs.addArg(*arg); + } + function->calc(inArgs, outArgs); + }; + + callFunction(function1_.get(), func1Inputs_, func1Outputs_); + callFunction(function2_.get(), func2Inputs_, func2Outputs_); + + // check outputs + compareOutputs(); + } + + std::shared_ptr getFunction1() const { return function1_; } + + std::shared_ptr getFunction2() const { return function2_; } + + protected: + // only init cpu argument, gpu argument copy from cpu argument. 
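+  // (The copy itself goes through test::CopyArgument, which handles both
+  // int32 and real-valued buffers across devices, so the second Function
+  // always starts from data identical to the first.)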
+ void initArg(BufferArg& arg) { + Vector1 vector(arg.shape().getElements(), (real*)arg.data()); + vector.uniform(0.001, 1); + } + + void initArg(SequenceArg& arg) { + /// init only matrix + Vector1 vector(arg.shape().getElements(), (real*)arg.data()); + vector.uniform(0.001, 1); + } + + void initArg(SequenceIdArg& arg, size_t batchSize) { + size_t numSeqs = arg.numSeqs(); + int* buf = reinterpret_cast(arg.data()); + int pos = 0; + size_t maxLen = 2 * batchSize / numSeqs; + for (int i = 0; i < (int)numSeqs; ++i) { + int len = 1 + uniformRandom(std::min( + maxLen, batchSize - pos - numSeqs + i)); + buf[i] = pos; + pos += len; + VLOG(1) << " len=" << len; + } + buf[numSeqs] = batchSize; + } + + void initInputs() { + for (size_t i = 0; i < func1Inputs_.size(); i++) { + if (func1Inputs_[i]->isSparseArg()) { + continue; /// sparse matrix already init + } + + if (func1Inputs_[i]->isSequenceArg()) { + initArg(dynamic_cast(*func1Inputs_[i])); + } else { + initArg(*func1Inputs_[i]); + } + + if (initArgsCallback_ != nullptr) { + initArgsCallback_(*func1Inputs_[i], i); + } + + copyArg_(*func1Inputs_[i], *func2Inputs_[i]); + } + } + + void initOutputs() { + for (size_t i = 0; i < func1Outputs_.size(); i++) { + if (func1Outputs_[i]->isSparseArg()) { + continue; /// sparse matrix already init + } + + if (func1Outputs_[i]->isSequenceArg()) { + initArg(dynamic_cast(*func1Outputs_[i])); + } else { + initArg(*func1Outputs_[i]); + } + + copyArg_(*func1Outputs_[i], *func2Outputs_[i]); + } + } + + void compareOutputs() { + for (size_t i = 0; i < func1Outputs_.size(); i++) { + // TODO, Need a BufferCheck used to compare the two buffers. + const auto cpu = func1Outputs_[i]; + const auto gpu = func2Outputs_[i]; + CHECK_EQ(cpu->numElements(), gpu->numElements()); + Vector1 cpuVector(cpu->numElements(), (real*)cpu->data()); + Vector2 gpuVector(gpu->numElements(), (real*)gpu->data()); + autotest::TensorCheckErr(cpuVector, gpuVector); + } + } + + protected: + std::shared_ptr function1_; + std::shared_ptr function2_; + std::vector> func1Memory_; + std::vector> func2Memory_; + std::vector func1Inputs_; + std::vector func1Outputs_; + std::vector func2Inputs_; + std::vector func2Outputs_; + std::shared_ptr sparse1_; + std::shared_ptr sparse2_; + std::shared_ptr seq1_; + std::shared_ptr seq2_; + test::CopyArgument copyArg_; + std::function initArgsCallback_; +}; + +class CpuGpuFuncCompare + : public Compare2Function { + public: + CpuGpuFuncCompare(const std::string& name, const FuncConfig& config) + : Compare2Function(name + "-CPU", name + "-GPU", config) {} + + ~CpuGpuFuncCompare() {} +}; + +} // namespace paddle diff --git a/paddle/legacy/function/GemmConvOp.cpp b/paddle/legacy/function/GemmConvOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5a81315661dc2843a648315ca4a6b590f217a657 --- /dev/null +++ b/paddle/legacy/function/GemmConvOp.cpp @@ -0,0 +1,522 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ConvOp.h" +#include "GemmFunctor.h" +#include "Im2Col.h" +#include "paddle/legacy/math/MemoryHandle.h" + +namespace paddle { + +/* + * \brief Forward calculation of convolution. + */ +template +class GemmConvFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + // TODO(hedaoyuan): Need to define some index macros, + // to avoid useing 0 and 1. + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + real beta; + if (outputs[0].getArgType() == ADD_TO) { + beta = 1.0; + } else { + beta = 0.0; + } + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* inputData = inputs[0].data(); + real* filterData = inputs[1].data(); + real* outputData = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } + + Im2ColFunctor im2col; + size_t inputOffset = imShape.getElements(); + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + + for (size_t i = 0; i < batchSize; i++) { + for (size_t g = 0; g < groups_; g++) { + if (needIm2col) { + im2col(inputData + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW(), + dilationH(), + dilationW()); + } else { + colData = inputData + g * inputOffset; + } + int M = outputChannels / groups_; + int N = outputHeight * outputWidth; + int K = inputChannels / groups_ * filterHeight * filterWidth; + BlasGemm::compute(false, + false, + M, + N, + K, + 1.0f, + filterData + g * filterOffset, + K, + colData, + N, + beta, + outputData + g * outputOffset, + N); + } + inputData += inputChannels * inputHeight * inputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } +}; + +#ifdef PADDLE_MOBILE_INFERENCE + +/* + * \brief Forward calculation of convolution, optimized for mobile. 
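+ *
+ * Instead of materializing the full im2col buffer, this implementation
+ * tiles the computation over channel and output-row blocks, bounding the
+ * temporary col matrix (its width is capped near 4096 and its total size
+ * near 4M; see the constants below), so peak memory stays small at the
+ * cost of some extra loop overhead.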
+ */ +template +class GemmConvMobileFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + // TODO(hedaoyuan): Need to define some index macros, + // to avoid useing 0 and 1. + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + real beta; + if (outputs[0].getArgType() == ADD_TO) { + beta = 1.0; + } else { + beta = 0.0; + } + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* inputData = inputs[0].data(); + real* filterData = inputs[1].data(); + real* outputData = outputs[0].data(); + real* colData = NULL; + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape; + + // Max col matrix width 4096, Max col matrix size 4M. + size_t outputHeightSteps = + std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight); + size_t maxColWidth = outputHeightSteps * outputWidth; + size_t channelSteps = + std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth, + (size_t)1), + inputChannels / groups_); + size_t maxColHeight = channelSteps * filterHeight * filterWidth; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(maxColHeight * maxColWidth * sizeof(real)); + colData = reinterpret_cast(memory_->getBuf()); + } + + Im2ColMobileFunctor im2col; + size_t inputOffset = imShape.getElements(); + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + + int nStride = outputHeight * outputWidth; + int kStride = inputChannels / groups_ * filterHeight * filterWidth; + for (size_t i = 0; i < batchSize; i++) { + filterData = inputs[1].data(); + for (size_t g = 0; g < groups_; g++) { + if (needIm2col) { + real beta_ = beta; + for (size_t ic = 0; ic < inputChannels / groups_; + ic += channelSteps) { + int channels = std::min(inputChannels / groups_ - ic, channelSteps); + for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) { + int height = std::min(outputHeight - oh, outputHeightSteps); + + int M = outputChannels / groups_; + int N = height * outputWidth; + int K = channels * filterHeight * filterWidth; + // im2col + im2col(inputData, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW(), + dilationH(), + dilationW(), + channels, + oh, + height, + N); + + // gemm + BlasGemm::compute( + false, + false, + M, + N, + K, + 1.0f, + filterData + ic * filterHeight * filterWidth, + kStride, + colData, + N, + beta_, + outputData + oh * outputWidth, + nStride); + } + beta_ = 1.0; 
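+            // The first tile uses the caller-requested beta; once a partial
+            // result exists, the remaining channel tiles must accumulate
+            // into it, hence beta_ switches to 1.0 here.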
+ } + } else { + int M = outputChannels / groups_; + int N = outputHeight * outputWidth; + int K = inputChannels / groups_ * filterHeight * filterWidth; + BlasGemm::compute(false, + false, + M, + N, + K, + 1.0f, + filterData, + K, + inputData, + N, + beta, + outputData, + N); + } + inputData += inputOffset; + outputData += outputOffset; + filterData += filterOffset; + } + } + + memory_.reset(); + } +}; + +#endif + +/* + * \brief Backward input calculation of convolution. + */ +template +class GemmConvGradInputFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& output = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& input = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + // Since the implementation of Col2ImFunctor is ADD_TO, + // this function only supports ADD_TO mode. + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + const TensorShape& output = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& input = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* outputGrad = inputs[0].data(); + real* filterData = inputs[1].data(); + real* inputGrad = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } + + Col2ImFunctor col2im; + size_t inputOffset = imShape.getElements(); + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + + for (size_t i = 0; i < batchSize; i++) { + for (size_t g = 0; g < groups_; g++) { + int K = outputChannels / groups_; + int N = outputHeight * outputWidth; + int M = inputChannels / groups_ * filterHeight * filterWidth; + real scale = 0.0f; + if (!needIm2col) { + colData = inputGrad + g * inputOffset; + scale = 1.0f; + } + BlasGemm::compute(true, + false, + M, + N, + K, + 1.0f, + filterData + g * filterOffset, + M, + outputGrad + g * outputOffset, + N, + scale, + colData, + N); + if (needIm2col) { + col2im(inputGrad + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW(), + dilationH(), + dilationW()); + } + } + inputGrad += inputChannels * inputHeight * inputWidth; + outputGrad += outputChannels * outputHeight * outputWidth; + } + } +}; + +/* + * \brief Backward filter calculation of convolution. 
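+ *
+ * For each sample and group, the filter gradient is accumulated as a GEMM
+ * of the output gradient (M x K) with the transposed col matrix, where
+ * M = outputChannels / groups, K = outputH * outputW, and
+ * N = (inputChannels / groups) * filterH * filterW, matching the
+ * BlasGemm call below.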
+ */ +template +class GemmConvGradFilterFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& output = inputs[0].shape(); + const TensorShape& input = inputs[1].shape(); + const TensorShape& filter = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + const TensorShape& output = inputs[0].shape(); + const TensorShape& input = inputs[1].shape(); + const TensorShape& filter = outputs[0].shape(); + + real beta; + if (outputs[0].getArgType() == ADD_TO) { + beta = 1.0; + } else { + beta = 0.0; + } + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* outputGrad = inputs[0].data(); + real* inputData = inputs[1].data(); + real* filterGrad = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } + + Im2ColFunctor im2col; + size_t inputOffset = imShape.getElements(); + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + for (size_t i = 0; i < batchSize; i++) { + for (size_t g = 0; g < groups_; g++) { + if (needIm2col) { + im2col(inputData + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW(), + dilationH(), + dilationW()); + } else { + colData = inputData + g * inputOffset; + } + int M = outputChannels / groups_; + int K = outputHeight * outputWidth; + int N = inputChannels / groups_ * filterHeight * filterWidth; + BlasGemm::compute(false, + true, + M, + N, + K, + 1.0f, + outputGrad + g * outputOffset, + K, + colData, + K, + i == 0 ? 
beta : 1.0f,
+                   filterGrad + g * filterOffset,
+                   N);
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputGrad += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+#ifdef PADDLE_MOBILE_INFERENCE
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
+#else
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
+#endif
+REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
+REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
+REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
+REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
+#endif
+
+} // namespace paddle
diff --git a/paddle/function/GemmConvOpTest.cpp b/paddle/legacy/function/GemmConvOpTest.cpp
similarity index 100%
rename from paddle/function/GemmConvOpTest.cpp
rename to paddle/legacy/function/GemmConvOpTest.cpp
diff --git a/paddle/legacy/function/GemmFunctor.cpp b/paddle/legacy/function/GemmFunctor.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..450293dfeea170e287cfc90226dabad25c76e537
--- /dev/null
+++ b/paddle/legacy/function/GemmFunctor.cpp
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GemmFunctor.h"
+#include "paddle/legacy/math/MathFunctions.h"
+
+namespace paddle {
+
+template <class T>
+struct BlasGemm<DEVICE_TYPE_CPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+#ifdef PADDLE_USE_EIGEN_FOR_BLAS
+    EigenBlasGemm<T>::compute(
+        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+#else
+    gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
+            transB == false ? CblasNoTrans : CblasTrans,
+            M,
+            N,
+            K,
+            alpha,
+            A,
+            lda,
+            B,
+            ldb,
+            beta,
+            C,
+            ldc);
+#endif
+  }
+};
+
+template <class T>
+struct BlasGemm<DEVICE_TYPE_GPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    hl_matrix_mul((T*)A,
+                  transA == false ? HPPL_OP_N : HPPL_OP_T,
+                  (T*)B,
+                  transB == false ? HPPL_OP_N : HPPL_OP_T,
+                  C,
+                  M,
+                  N,
+                  K,
+                  alpha,
+                  beta,
+                  lda,
+                  ldb,
+                  ldc);
+  }
+};
+
+template struct BlasGemm<DEVICE_TYPE_CPU, real>;
+template struct BlasGemm<DEVICE_TYPE_GPU, real>;
+
+} // namespace paddle
diff --git a/paddle/function/GemmFunctor.h b/paddle/legacy/function/GemmFunctor.h
similarity index 100%
rename from paddle/function/GemmFunctor.h
rename to paddle/legacy/function/GemmFunctor.h
diff --git a/paddle/function/GruFunctor.h b/paddle/legacy/function/GruFunctor.h
similarity index 100%
rename from paddle/function/GruFunctor.h
rename to paddle/legacy/function/GruFunctor.h
diff --git a/paddle/function/Im2Col.h b/paddle/legacy/function/Im2Col.h
similarity index 100%
rename from paddle/function/Im2Col.h
rename to paddle/legacy/function/Im2Col.h
diff --git a/paddle/function/Im2ColOp.cpp b/paddle/legacy/function/Im2ColOp.cpp
similarity index 100%
rename from paddle/function/Im2ColOp.cpp
rename to paddle/legacy/function/Im2ColOp.cpp
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/legacy/function/Im2ColOpGpu.cu
similarity index 100%
rename from paddle/function/Im2ColOpGpu.cu
rename to paddle/legacy/function/Im2ColOpGpu.cu
diff --git a/paddle/legacy/function/Im2ColTest.cpp b/paddle/legacy/function/Im2ColTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2c5f06f38991497963cfbe1e12825f1bc39dffa6
--- /dev/null
+++ b/paddle/legacy/function/Im2ColTest.cpp
@@ -0,0 +1,223 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "Im2Col.h" +#include +#include "Function.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/tests/TensorCheck.h" + +namespace paddle { + +template +void TestIm2ColFunctor() { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t filterHeight : {1, 5}) { + for (size_t filterWidth : {3, 7}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + for (size_t dilation : {1, 3}) { + size_t filterSizeH = (filterHeight - 1) * dilation + 1; + size_t filterSizeW = (filterWidth - 1) * dilation + 1; + if (inputHeight + 2 * padding < filterSizeH || + inputWidth + 2 * padding < filterSizeW) + break; + if (padding >= filterSizeH || padding >= filterSizeW) break; + size_t outputHeight = + (inputHeight - filterSizeH + 2 * padding) / stride + 1; + size_t outputWidth = + (inputWidth - filterSizeW + 2 * padding) / stride + 1; + + TensorShape imShape = + TensorShape({channels, inputHeight, inputWidth}); + TensorShape colShape1 = TensorShape({channels, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + TensorShape colShape2 = TensorShape({outputHeight, + outputWidth, + channels, + filterHeight, + filterWidth}); + + size_t height = channels * filterHeight * filterWidth; + size_t width = outputHeight * outputWidth; + VectorPtr input1 = + Vector::create(imShape.getElements(), false); + VectorPtr input2 = + Vector::create(imShape.getElements(), false); + MatrixPtr output1 = + Matrix::create(height, width, false, false); + MatrixPtr output2 = + Matrix::create(width, height, false, false); + input1->uniform(0.001, 1); + input2->copyFrom(*input1); + + Im2ColFunctor im2Col1; + Im2ColFunctor im2Col2; + im2Col1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation); + im2Col2(input2->getData(), + imShape, + output2->getData(), + colShape2, + stride, + stride, + padding, + padding, + dilation, + dilation); + + // The transposition of the result of ColFormat == kCFO + // is equal to the result of ColFormat == kOCF. 
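+                  // (kCFO lays the col matrix out as
+                  // [channels * filterH * filterW] x [outputH * outputW];
+                  // kOCF stores the transpose, which is why output2 was
+                  // created with swapped dimensions above.)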
+ MatrixPtr test; + output2->transpose(test, true); + autotest::TensorCheckErr(*output1, *test); + + Col2ImFunctor col2Im1; + Col2ImFunctor col2Im2; + + col2Im1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation); + col2Im2(input2->getData(), + imShape, + output2->getData(), + colShape2, + stride, + stride, + padding, + padding, + dilation, + dilation); + autotest::TensorCheckErr(*input1, *input2); + } + } + } + } + } + } + } + } +} + +TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor(); } + +#ifdef PADDLE_WITH_CUDA + +TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor(); } + +#endif + +template +void TestIm2ColMobileFunctor() { + for (size_t channels : {32}) { + for (size_t inputHeight : {33, 100}) { + for (size_t inputWidth : {32, 96}) { + for (size_t filterHeight : {5}) { + for (size_t filterWidth : {7}) { + for (size_t stride : {2}) { + for (size_t padding : {1}) { + for (size_t dilation : {1, 3}) { + size_t filterSizeH = (filterHeight - 1) * dilation + 1; + size_t filterSizeW = (filterWidth - 1) * dilation + 1; + if (inputHeight + 2 * padding < filterSizeH || + inputWidth + 2 * padding < filterSizeW) + break; + if (padding >= filterSizeH || padding >= filterSizeW) break; + size_t outputHeight = + (inputHeight - filterSizeH + 2 * padding) / stride + 1; + size_t outputWidth = + (inputWidth - filterSizeW + 2 * padding) / stride + 1; + + TensorShape imShape = + TensorShape({channels, inputHeight, inputWidth}); + TensorShape colShape1 = TensorShape({channels, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + size_t height = channels * filterHeight * filterWidth; + size_t width = outputHeight * outputWidth; + VectorPtr input1 = + Vector::create(imShape.getElements(), false); + VectorPtr input2 = + Vector::create(imShape.getElements(), false); + MatrixPtr output1 = + Matrix::create(height, width, false, false); + MatrixPtr output2 = + Matrix::create(height, width, false, false); + input1->uniform(0.001, 1); + input2->copyFrom(*input1); + + Im2ColFunctor im2Col1; + Im2ColMobileFunctor im2Col2; + im2Col1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation); + im2Col2(input2->getData(), + imShape, + output2->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation, + channels, + 0, + outputHeight, + outputHeight * outputWidth); + + autotest::TensorCheckEqual(*output1, *output2); + } + } + } + } + } + } + } + } +} + +TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor(); } + +} // namespace paddle diff --git a/paddle/legacy/function/MulOp.cpp b/paddle/legacy/function/MulOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..140103175290703e1a0c171d8f45cdc59a1f6912 --- /dev/null +++ b/paddle/legacy/function/MulOp.cpp @@ -0,0 +1,347 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "MulOp.h" +#include "GemmFunctor.h" +#include "paddle/legacy/math/SIMDFunctions.h" +#include "paddle/utils/ThreadLocal.h" + +namespace { +inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { + for (unsigned int i = 0; i < len; ++i) { + a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i]; + } +} + +inline void colVecAddTo( + real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) { + for (unsigned int i = 0; i < len; ++i) { + a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c; + } +} +} // namespace + +namespace paddle { +/// sparse matrix (+)= dense matrix * dense matrix +template <> +void MulOp(CpuSparseMatrix& out, + const CpuMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + CHECK_EQ(out.getValueType(), FLOAT_VALUE); + if (scaleT == 0) { + out.zeroMem(); + } + const real* A = a.getData(); + const real* B = b.getData(); + real* C = out.getValue(); + int* rows = out.getRows(); + int* cols = out.getCols(); + size_t width = out.getWidth(); + size_t height = out.getHeight(); + + /// SPARSE_CSC, {a any, b not trans} + if (out.getFormat() == SPARSE_CSC) { + /// b not trans and a any + CHECK(!bTrans); + size_t m = !aTrans ? a.getWidth() : a.getHeight(); + for (size_t i = 0; i < width; i++) { + size_t start = out.getColStartIdx(i); + size_t end = out.getColStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t rowIdx = rows[j]; + for (size_t k = 0; k < m; k++) { + sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) * + B[k * width + i]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + return; + } + + /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans} + if (out.getFormat() == SPARSE_CSR) { + /// a and b can not both transpose + CHECK(!(aTrans && bTrans)); + size_t m = a.getWidth(); + for (size_t i = 0; i < height; i++) { + size_t start = out.getRowStartIdx(i); + size_t end = out.getRowStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t colIdx = cols[j]; + for (size_t k = 0; k < m; k++) { + sum += (!aTrans ? A[i * m + k] : A[k * height + i]) * + (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]); + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + return; + } +} + +/// dense matrix (+)= dense matrix * dense matrix +template <> +void MulOp(CpuMatrix& out, + const CpuMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + BlasGemm::compute( + aTrans, + bTrans, + out.getHeight(), + out.getWidth(), + !aTrans ? a.getWidth() : a.getHeight(), + scaleAB, + a.getData(), + a.getStride(), + b.getData(), + b.getStride(), + scaleT, + out.getData(), + out.getStride()); +} + +/// dense matrix (+)= sparse matrix * dense matrix +template <> +void MulOp(CpuMatrix& out, + const CpuSparseMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + if (scaleT == 0) { + out.zeroMem(); + } + const real* B = b.getData(); + real* C = out.getData(); + if (out.getWidth() % 32 == 0) { + CHECK_EQ((size_t)B % 32, 0UL); + CHECK_EQ((size_t)C % 32, 0UL); + } + + int* cols = a.getCols(); + real* values = a.getValue(); + for (size_t i = 0; i < a.getHeight(); ++i) { + const int start = a.getRowStartIdx(i); + const int end = a.getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]), + !aTrans ? const_cast(b).getRow(cols[j]) + : const_cast(b).getRow(i), + (a.getValueType() == FLOAT_VALUE) ? 
values[j] : (real)1.0, + out.getWidth()); + } + } +} + +/// dense matrix (+)= dense matrix * sparse matrix +template <> +void MulOp(CpuMatrix& out, + const CpuMatrix& a, + const CpuSparseMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + if (scaleT == 0) { + out.zeroMem(); + } + real* A = const_cast(a.getData()); + real* B = const_cast(b.getValue()); + real* C = out.getData(); + int* rows = b.getRows(); + int* cols = b.getCols(); + + /// SPARSE_CSC format + if (b.getFormat() == SPARSE_CSC) { + for (size_t j = 0; j < b.getWidth(); ++j) { + int start = b.getColStartIdx(j); + int end = b.getColStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo(!bTrans ? C + j : C + rows[i], + !bTrans ? A + rows[i] : A + j, + (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i], + out.getHeight(), + out.getWidth(), + a.getWidth()); + } + } + return; + } + + /// SPARSE_CSR format + if (b.getFormat() == SPARSE_CSR) { + for (size_t j = 0; j < b.getHeight(); ++j) { + int start = b.getRowStartIdx(j); + int end = b.getRowStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo(!bTrans ? C + cols[i] : C + j, + !bTrans ? A + j : A + cols[i], + (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i], + out.getHeight(), + out.getWidth(), + a.getWidth()); + } + } + return; + } +} + +/** + * mul operator + * out = scaleT * out + scaleAB * (A * B) + * here, scaleT in {0, 1}, scaleAB == 1, + * out = A * B, ASSIGN_TO + * out += A * B, ADD_TO + * + * + * \param outputs[0] output matrix (out), M * N, + * could be either Sparse or Dense Matrix + * M is num of rows, N is num of columns + * \param inputs[0] first input matrix (A), M * K (if non-trans) + * could be either Sparse or Dense Matrix + * M is num of rows, K is num of columns + * \param inputs[1] second input matrix (B), K * N (if non-trans) + * could be either Sparse or Dense Matrix + * K is num of rows, N is num of columns + * + * Support eight Mul operators, with both GPU and CPU devices + * For each device, four Mul operators are supported: + * 1. dense (out) = dense (A) * dense (B) + * 2. dense (out) = sparse (A) * dense (B) + * sparse matrix only support SPARSE_CSR format + * 3. dense (out) = dense (A) * sparse (B) + * sparse matrix support SPARSE_CSC and SPARSE_CSR formats + * 4. sparse (out) = dense (A) * dense (B) + * sparse matrix support SPARSE_CSC and SPARSE_CSR formats + * + */ +template +class MulFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + aTrans_ = config.get("aTrans"); + bTrans_ = config.get("bTrans"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK(!aTrans_ || !bTrans_) + << "Not support both a and b are transpose matrices"; + + CHECK_EQ((size_t)2, inputs.size()); + CHECK_EQ((size_t)1, outputs.size()); + CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data()); + CHECK_EQ(inputs[0].shape().ndims(), (size_t)2); + CHECK_EQ(inputs[1].shape().ndims(), (size_t)2); + CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); + + size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1]; + size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0]; + size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1]; + size_t bCol = !bTrans_ ? 
inputs[1].shape()[1] : inputs[1].shape()[0]; + /// C = A * B, or C += A * B, for matrix format + CHECK_EQ(aCol, bRow); + CHECK_EQ(aRow, outputs[0].shape()[0]); + CHECK_EQ(bCol, outputs[0].shape()[1]); + + /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO) + real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0; + + /// support dense = not both sparse * sparse + /// or sparse = dense * dense + CHECK((!outputs[0].isSparseArg() && + !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) || + (outputs[0].isSparseArg() && !inputs[0].isSparseArg() && + !inputs[1].isSparseArg())); + + auto outMat = outputs[0].matrix(); + /// dense matrix = dense matrix * dense matrix + if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { + MulOp(outMat, + inputs[0].matrix(), + inputs[1].matrix(), + 1.0, // scaleAB + scaleT, + aTrans_, + bTrans_); + return; + } + + /// dense matrix = dense matrix * sparse matrix + if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { + CHECK(!aTrans_) << "Not supported a transpose"; + MulOp(outMat, + inputs[0].matrix(), + inputs[1].sparse().SparseMatrix(), + 1.0, // scaleAB + scaleT, + aTrans_, + bTrans_); + return; + } + + /// dense matrix = sparse matrix * dense matrix + if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { + CHECK(!bTrans_) << "Not supported b transpose"; + CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR) + << "Only supported SPARSE_CSR format for sparse matrix a"; + MulOp(outMat, + inputs[0].sparse().SparseMatrix(), + inputs[1].matrix(), + 1.0, // scaleAB + scaleT, + aTrans_, + bTrans_); + return; + } + + /// sparse matrix = dense matrix * dense matrix + auto outSparseMat = outputs[0].sparse().SparseMatrix(); + if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + outputs[0].isSparseArg()) { + MulOp(outSparseMat, + inputs[0].matrix(), + inputs[1].matrix(), + 1.0, // scaleAB + scaleT, + aTrans_, + bTrans_); + return; + } + } + + private: + bool aTrans_; + bool bTrans_; +}; + +REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc); +#endif +} // namespace paddle diff --git a/paddle/legacy/function/MulOp.h b/paddle/legacy/function/MulOp.h new file mode 100644 index 0000000000000000000000000000000000000000..ab33bde17296cd2b17ac45c5a936cfd2727919a5 --- /dev/null +++ b/paddle/legacy/function/MulOp.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "Function.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { +/// CPU, dense matrix (+)= dense matrix * dense matrix +template +void MulOp(CpuMatrix& out, + const CpuMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// CPU, dense matrix (+)= sparse matrix * dense matrix +template +void MulOp(CpuMatrix& out, + const CpuSparseMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// CPU, dense matrix (+)= dense matrix * sparse matrix +template +void MulOp(CpuMatrix& out, + const CpuMatrix& a, + const CpuSparseMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// CPU, sparse matrix (+)= dense matrix * dense matrix +template +void MulOp(CpuSparseMatrix& out, + const CpuMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// GPU, dense matrix (+)= dense matrix * dense matrix +template +void MulOp(GpuMatrix& out, + const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// GPU, dense matrix (+)= sparse matrix * dense matrix +template +void MulOp(GpuMatrix& out, + const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// GPU, dense matrix (+)= dense matrix * sparse matrix +template +void MulOp(GpuMatrix& out, + const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// GPU, sparse matrix (+)= dense matrix * dense matrix +template +void MulOp(GpuSparseMatrix& out, + const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +} // namespace paddle diff --git a/paddle/legacy/function/MulOpGpu.cu b/paddle/legacy/function/MulOpGpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..217c983cb75dfcbc0e17f752a66847c5e92fcc91 --- /dev/null +++ b/paddle/legacy/function/MulOpGpu.cu @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MulOp.h" +#include "hl_base.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { +/// dense matrix (+)= dense matrix * dense matrix +template <> +void MulOp(GpuMatrix& out, + const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; + hl_matrix_mul(const_cast(a.getData()), + !aTrans ? HPPL_OP_N : HPPL_OP_T, + const_cast(b.getData()), + !bTrans ? HPPL_OP_N : HPPL_OP_T, + const_cast(out.getData()), + out.getHeight(), + out.getWidth(), + !aTrans ? 
a.getWidth() : a.getHeight(), + scaleAB, + scaleT, + a.getStride(), + b.getStride(), + out.getStride()); +} + +/// dense matrix (+)= sparse matrix * dense matrix +template <> +void MulOp(GpuMatrix& out, + const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + CHECK(out.isContiguous()); + CHECK(b.isContiguous()); + CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; + hl_matrix_csr_mul_dense(a.sMatrix_.get(), + aTrans ? HPPL_OP_T : HPPL_OP_N, + const_cast(b.getData()), + HPPL_OP_N, + const_cast(out.getData()), + out.getHeight(), + out.getWidth(), + b.getHeight(), + scaleAB, + scaleT); +} + +/// dense matrix (+)= dense matrix * sparse matrix +template <> +void MulOp(GpuMatrix& out, + const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + CHECK(out.isContiguous()); + CHECK(a.isContiguous()); + CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; + + if (b.format_ == SPARSE_CSC) { + hl_matrix_dense_mul_csc(const_cast(a.getData()), + HPPL_OP_N, + b.sMatrix_.get(), + bTrans ? HPPL_OP_T : HPPL_OP_N, + const_cast(out.getData()), + out.getHeight(), + out.getWidth(), + a.getWidth(), + scaleAB, + scaleT); + } else { + hl_matrix_dense_mul_csr(const_cast(a.getData()), + HPPL_OP_N, + b.sMatrix_.get(), + bTrans ? HPPL_OP_T : HPPL_OP_N, + const_cast(out.getData()), + out.getHeight(), + out.getWidth(), + a.getWidth(), + scaleAB, + scaleT); + } +} + +/// sparse matrix (+)= dense matrix * dense matrix +template <> +void MulOp(GpuSparseMatrix& out, + const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; + hl_sparse_matrix_mul(const_cast(a.getData()), + aTrans ? HPPL_OP_T : HPPL_OP_N, + const_cast(b.getData()), + bTrans ? HPPL_OP_T : HPPL_OP_N, + out.sMatrix_.get(), + out.getHeight(), + out.getWidth(), + !bTrans ? b.getHeight() : b.getWidth(), + scaleAB, + scaleT); +} + +} // namespace paddle diff --git a/paddle/legacy/function/MulOpTest.cpp b/paddle/legacy/function/MulOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ab08b6f8696ff4aefd2dbdda591b20730b46898c --- /dev/null +++ b/paddle/legacy/function/MulOpTest.cpp @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "FunctionTest.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/math/tests/test_matrixUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT + +/** + * C += A * B, A, B, C dense matrix + * dense = dense * dense + */ +void testFuncDDDMatrix( + bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) { + real scaleT = 1.0; + size_t heightA = (transa == false) ? dimM : dimK; + size_t widthA = (transa == false) ? dimK : dimM; + size_t heightB = (transb == false) ? 
dimK : dimN; + size_t widthB = (transb == false) ? dimN : dimK; + size_t heightC = dimM; + size_t widthC = dimN; + // init Test object + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb)); + // prepare input arguments + /// matrix A : HA * WA + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA})); + /// matrix B: HB * WB + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB})); + + /// output matrix C: HC * WC + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}), + scaleT == 1.0 ? ADD_TO : ASSIGN_TO); + // run Function + test.run(); +} + +TEST(MulOp, DDDMatrixMul) { + LOG(INFO) << "function test for dense = dense * dense matrix"; + for (const auto transa : {false, true}) { + for (const auto transb : {false, true}) { + for (const auto dimM : {1, 10, 100}) { + for (const auto dimN : {1, 10}) { + for (const auto dimK : {8}) { + if (transa && transb) { + continue; + } + VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') + << " transa=" << transa << " transb=" << transb + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK; + testFuncDDDMatrix(transa, transb, dimM, dimN, dimK); + } + } + } + } + } +} + +/** + * C += A * B, B, C dense, A sparse + * dense = sparse * dense + */ +void testFuncDSparseDMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { + real scaleT = 1.0; + // init Test object + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); + // prepare input arguments + /// sparse matrix A : M * K + test.addInputs(SparseMatrixArg( + VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE)); + /// matrix B: K * N + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN})); + + /// output matrix C: M * N + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), + scaleT == 1.0 ? ADD_TO : ASSIGN_TO); + // run Function + test.run(); +} + +TEST(MuLOp, DSparseDMul) { + LOG(INFO) << "function test for dense = sparse * dense matrix"; + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto FORMAT : {SPARSE_CSR}) { + VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } +} + +/** + * C += A * B, A, C dense, B sparse + * dense = dense * sparse + */ +void testFuncDDSparseMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { + real scaleT = 1.0; + // init Test object + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); + // prepare input arguments + /// matrix A : M * K + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); + + /// matrix B: K * N + test.addInputs(SparseMatrixArg( + VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE)); + + /// output matrix C: M * N + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), + scaleT == 1.0 ? 
ADD_TO : ASSIGN_TO); + // run Function + test.run(); +} + +TEST(MulOp, DDSparseMul) { + LOG(INFO) << "function test for dense = dense * sparse matrix"; + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto FORMAT : {SPARSE_CSR, SPARSE_CSC}) { + VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testFuncDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } +} + +/** + * C += A * B, A sparse, B, C dense + * sparse = dense * dense + */ +void testFuncSparseDDMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { + real scaleT = 1.0; + // init Test object + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); + // prepare input arguments + /// matrix A : M * K + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); + + /// matrix B: K * N + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN})); + + /// output sparse matrix C: M * N + test.addOutputs( + SparseMatrixArg( + VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE), + scaleT == 1.0 ? ADD_TO : ASSIGN_TO); + // run Function + test.run(); +} + +TEST(MulOp, SparseDDMul) { + LOG(INFO) << "function test for sparse = dense * dense matrix"; + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) { + VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testFuncSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } +} diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/legacy/function/NaiveConvOp.cpp similarity index 100% rename from paddle/function/NaiveConvOp.cpp rename to paddle/legacy/function/NaiveConvOp.cpp diff --git a/paddle/legacy/function/PadOp.cpp b/paddle/legacy/function/PadOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9d011d28e6938fac6980bed88f774abdbf3532d4 --- /dev/null +++ b/paddle/legacy/function/PadOp.cpp @@ -0,0 +1,215 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "PadOp.h" +#include "paddle/legacy/math/Vector.h" + +namespace paddle { + +template <> +void Pad(real* outputs, + const real* inputs, + const int num, + const int inC, + const int inH, + const int inW, + const PadConf& pad) { + int cstart = pad.channel[0], cend = pad.channel[1]; + int hstart = pad.height[0], hend = pad.height[1]; + int wstart = pad.width[0], wend = pad.width[1]; + int outC = inC + cstart + cend; + int outH = inH + hstart + hend; + int outW = inW + wstart + wend; + for (int i = 0; i < num; i++) { + for (int c = 0; c < inC; c++) { + for (int h = 0; h < inH; h++) { + int inoff = ((i * inC + c) * inH + h) * inW; + int outoff = + ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart; + memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real)); + } + } + } +} + +template <> +void PadGrad(real* inGrad, + const real* outGrad, + const int num, + const int inC, + const int inH, + const int inW, + const PadConf& pad) { + int cstart = pad.channel[0], cend = pad.channel[1]; + int hstart = pad.height[0], hend = pad.height[1]; + int wstart = pad.width[0], wend = pad.width[1]; + int outC = inC + cstart + cend; + int outH = inH + hstart + hend; + int outW = inW + wstart + wend; + for (int i = 0; i < num; i++) { + for (int c = 0; c < inC; c++) { + for (int h = 0; h < inH; h++) { + int inoff = ((i * inC + c) * inH + h) * inW; + int outoff = + ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart; + CpuVector inG = CpuVector(inW, inGrad + inoff); + CpuVector outG = CpuVector(inW, const_cast(outGrad + outoff)); + inG += outG; + } + } + } +} + +static inline PadConf castToPadConf(const FuncConfig& conf) { + return {conf.get>("channel"), + conf.get>("height"), + conf.get>("width")}; +} + +/** + * \brief Padding zeros to input according to the specify dimension. + * The struct pad_ contains the padding size in each dimension. + * The input and output is a 4D tensor. In PadFunc, we only + * pad zeros to the 2nd to 4th dimension. + * + * Argument in this Function: + * \param pad_ A struct object contains the padding size in each dimension. + * It has six integers. The channelStart and channelEnd indicate + * how many zeros to add before and after the input in channel + * dimension. And the heightStart and heightEnd indicate padding + * in height dimension. The widthStart and widthEnd indicate the + * padding in width dimension. + * \param inputs A 4D tensor, only one input. + * \param outputs A 4D tensor, the output value after padding. + * + * For example, + * Input(2,2,2,3) = [ + * [ [[1,2,3], [3,4,5]], + * [[2,3,5], [1,6,7]] ], + * [ [[4,3,1], [1,8,7]], + * [[3,8,9], [2,3,5]] ] + * ] # the shape is (1,2,2,3) + * + * pad_: if channelStart = channelEnd = 1, others are 0. + * Output(2,4,2,3) = [ + * [ [[0,0,0], [0,0,0]], + * [[1,2,3], [3,4,5]], + * [[2,3,5], [1,6,7]], + * [[0,0,0], [0,0,0]] ], + * [ [[0,0,0], [0,0,0]], + * [[4,3,1], [1,8,7]], + * [[3,8,9], [2,3,5]], + * [[0,0,0], [0,0,0]] ] + * ] # the shape is (2,4,2,3) + * + * pad_: if widthStart = 1, widthEnd = 2, others are 0. + * Output(2,2,2,6) = [ + * [ [[0,1,2,3,0,0], [0,3,4,5,0,0]], + * [[0,2,3,5,0,0], [0,1,6,7,0,0]] ], + * [ [[0,4,3,1,0,0], [0,1,8,7,0,0]], + * [[0,3,8,9,0,0], [0,2,3,5,0,0]] ], + * ] # the shape is (2,2,2,6) + * + * pad_: if heightStart = 1, heightEnd = 1, others are 0. 
+ * Output(2,2,4,3) = [ + * [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]], + * [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ], + * [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]], + * [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ], + * ] # the shape is (2,2,4,3) + */ + +template +class PadFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { pad_ = castToPadConf(config); } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + + size_t num = inputs[0].shape()[0]; + size_t inC = inputs[0].shape()[1]; + size_t inH = inputs[0].shape()[2]; + size_t inW = inputs[0].shape()[3]; + typename Tensor::Vector vec(outputs[0].shape().getElements(), + outputs[0].data()); + vec.zero(); + + Pad(outputs[0].data(), + inputs[0].data(), + num, + inC, + inH, + inW, + pad_); + } + + private: + PadConf pad_; +}; + +/** + * \brief The backward propagation of padding Function. Remove the elements + * in the padding positions of forward. + * + * Argument in this Function: + * \param pad_ The same meaning as it in PadFunc. + * \param inputs The gradient with respect to the output value of PadFunc. + * \param outputs The gradient with respect to the input value of PadFunc. + */ + +template +class PadGradFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { pad_ = castToPadConf(config); } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + + size_t num = outputs[0].shape()[0]; + size_t inC = outputs[0].shape()[1]; + size_t inH = outputs[0].shape()[2]; + size_t inW = outputs[0].shape()[3]; + + if (outputs[0].getArgType() != ADD_TO) { + // for unit test + typename Tensor::Vector tmp( + outputs[0].shape().getElements(), outputs[0].data()); + tmp.zero(); + } + + PadGrad(outputs[0].data(), + inputs[0].data(), + num, + inC, + inH, + inW, + pad_); + } + + private: + PadConf pad_; +}; + +REGISTER_TYPED_FUNC(Pad, CPU, PadFunc); +REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(Pad, GPU, PadFunc); +REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/PadOp.h b/paddle/legacy/function/PadOp.h similarity index 100% rename from paddle/function/PadOp.h rename to paddle/legacy/function/PadOp.h diff --git a/paddle/function/PadOpGpu.cu b/paddle/legacy/function/PadOpGpu.cu similarity index 100% rename from paddle/function/PadOpGpu.cu rename to paddle/legacy/function/PadOpGpu.cu diff --git a/paddle/function/PadOpTest.cpp b/paddle/legacy/function/PadOpTest.cpp similarity index 100% rename from paddle/function/PadOpTest.cpp rename to paddle/legacy/function/PadOpTest.cpp diff --git a/paddle/legacy/function/RowConvOp.cpp b/paddle/legacy/function/RowConvOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3be50e80d71fabdb3e7a22bfc061da09412c132d --- /dev/null +++ b/paddle/legacy/function/RowConvOp.cpp @@ -0,0 +1,225 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RowConvOp.h"
+#include <iostream>
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void RowConv<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                              const CpuMatrix& in,
+                              const CpuMatrix& filter,
+                              const CpuIVector& seq) {
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  for (size_t i = 0; i < numSeq; ++i) {
+    size_t begin = starts[i];
+    size_t end = starts[i + 1];
+    for (size_t j = begin; j < end; ++j) {
+      MatrixPtr x;
+      MatrixPtr w;
+      if ((j + contextLength) < end) {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, contextLength);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, contextLength);
+      } else {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, end - j);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, end - j);
+      }
+      MatrixPtr y = out.subMatrix(j, 1);
+      y->addDotMulVMM(*x, *w);
+    }
+  }
+}
+
+template <>
+void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
+                                  const CpuMatrix& in,
+                                  const CpuMatrix& filter,
+                                  CpuMatrix& inG,
+                                  CpuMatrix& filterG,
+                                  const CpuIVector& seq) {
+  // gradient w.r.t filter
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  if (filterG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
+        MatrixPtr x =
+            (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
+        MatrixPtr dy =
+            (const_cast<CpuMatrix&>(outG)).subMatrix(begin, steps - j);
+        MatrixPtr dw = filterG.subMatrix(j, 1);
+        dw->addDotMulVMM(*dy, *x);
+      }
+    }
+  }
+
+  // gradient w.r.t input feature
+  if (inG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < steps; ++j) {
+        MatrixPtr dx = inG.subMatrix(begin + j, 1);
+        for (size_t t = 0; t < contextLength; ++t) {
+          if (int(j - t) >= 0) {
+            MatrixPtr dy =
+                (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
+            MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
+            dx->addDotMul(*dy, *w, 1.0, 1.0);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief The row convolution is also called lookahead convolution. It was
+ * first introduced in the DeepSpeech2 system. A bidirectional RNN learns a
+ * representation for a sequence by performing a forward and a backward pass
+ * through the entire sequence. However, unlike unidirectional RNNs,
+ * bidirectional RNNs are challenging to deploy in an online and low-latency
+ * setting. The lookahead convolution incorporates information from future
+ * subsequences in a computationally efficient manner to improve unidirectional
+ * recurrent neural networks.
+ *
+ * The connection of row convolution differs from the 1D sequence
+ * convolution. Assume the future context length is k; that is to say, the
+ * output at timestep t is computed from the input features from the t-th
+ * timestep to the (t+k)-th timestep. Assume the hidden dim of the input
+ * activations is d; the activations r_t for the new layer at time-step t are:
+ *
+ *
+ *            -- k + 1
+ *  r(t,i) =  >       W(i,j) * h(t+j-1, i),  for (1 <= i <= d)
+ *            -- j = 1
+ *
+ *
+ * The weight shape is: (k + 1) x d
+ * Function Arguments:
+ *
+ * \param inputs[0]  The input activations.
+ * \param inputs[1]  The filter (or weight), whose shape is (k+1) x d.
+ * \param outputs[0] The output activations.
+ *
+ * [1] Dario Amodei, et al. Deep Speech 2 : End-to-End Speech Recognition in
+ *     English and Mandarin. https://arxiv.org/abs/1512.02595
+ */
+
+template <DeviceType Device>
+class RowConvFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    // TODO(qingqing): support ASSIGN_TO.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto w = inputs[1];
+    CHECK(in.data() && out.data() && in.getSequenceId().data());
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == out.shape());
+    CHECK_EQ(w.shape()[1], in.shape()[1]);
+
+    auto outMat = out.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+
+    RowConv<Device>(outMat, inMat, wMat, seqId);
+  }
+};
+
+/**
+ * \brief The backward of the row convolution function. This function
+ * calculates the gradient w.r.t. the filter and the gradient w.r.t. the
+ * input activations (or data).
+ *
+ * Argument in this Function:
+ *
+ * \param inputs[0]  The gradient w.r.t. the output activations.
+ * \param inputs[1]  The input activations.
+ * \param inputs[2]  The filter (or weight), whose shape is (k+1) x d.
+ * \param outputs[0] The gradient w.r.t. the input activations.
+ * \param outputs[1] The gradient w.r.t. the filter.
+ *
+ * Abbreviation:
+ * w.r.t: with respect to.
+ */
+
+template <DeviceType Device>
+class RowConvGradFunc : public FunctionBase {
+  // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
+ public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(3UL, inputs.size());
+    CHECK_EQ(2UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
+          outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+
+    const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
+    const auto w = inputs[2];
+    auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto wGrad = outputs[1];
+
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == inGrad.shape());
+    CHECK(in.shape() == outGrad.shape());
+    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
+
+    const auto outGMat = outGrad.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    auto inGMat = inGrad.data()
+                      ? inGrad.matrix<Device>()
+                      : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    auto wGMat = wGrad.data()
+                     ?
wGrad.matrix() + : typename Tensor::Matrix(nullptr, 0, 0); + const auto seqId = in.getSequenceId().vector(); + + RowConvGrad(outGMat, inMat, wMat, inGMat, wGMat, seqId); + } +}; + +REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc); +REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc); +REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/RowConvOp.h b/paddle/legacy/function/RowConvOp.h similarity index 100% rename from paddle/function/RowConvOp.h rename to paddle/legacy/function/RowConvOp.h diff --git a/paddle/legacy/function/RowConvOpGpu.cu b/paddle/legacy/function/RowConvOpGpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..a6d2e4c7e38b12bcd448a85f9e74df226e6984af --- /dev/null +++ b/paddle/legacy/function/RowConvOpGpu.cu @@ -0,0 +1,373 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/cuda/include/hl_base.h" +#include "paddle/legacy/function/RowConvOp.h" + +namespace paddle { + +template +__global__ void KeRowConv(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sw[BLOCK_H][BLOCK_W]; + + for (int i = tidy; i < context; i += blky) { + sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0; + } + + __syncthreads(); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + real sum = 0; + int off = (start + j) * width; + for (int t = 0; t < context; ++t) { + if ((start + j + t) < end) { + int xoff = off + t * width; + real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0; + sum += sw[t][tidx] * xVal; + } + } + if (gidx + tidx < width) { + y[off + gidx + tidx] += sum; + } + } + } +} + +__global__ void KeRowConv2(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + int off = (start + j) * width; + real sum = 0; + for (int t = 0; t < context && (start + j + t) < end; ++t) { + int xoff = off + t * width; + real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0; + real wd = gidx + tidx < width ? 
w[t * width + gidx + tidx] : 0.0; + sum += wd * xd; + } + if (gidx + tidx < width) { + y[off + gidx + tidx] += sum; + } + } + } +} + +template <> +void RowConv(GpuMatrix& out, // NOLINT + const GpuMatrix& in, + const GpuMatrix& filter, + const GpuIVector& seq) { + const size_t numSeq = seq.getSize() - 1; + const size_t contextLength = filter.getHeight(); + const size_t height = in.getHeight(); + const size_t width = in.getWidth(); + + real* y = out.getData(); + const real* x = in.getData(); + const real* w = filter.getData(); + const int* starts = seq.getData(); + + dim3 dimBlock(32, 32); + dim3 dimGrid(DIVUP(width, dimBlock.x), 1); + + if (contextLength <= 32) { + KeRowConv<32, 32><<>>( + y, x, w, starts, height, width, numSeq, contextLength); + } else { + KeRowConv2<<>>( + y, x, w, starts, height, width, numSeq, contextLength); + } + CHECK_SYNC("RowConv"); +} + +template +__global__ void KeRowConvBwWeight(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sh_x[BLOCK_W][BLOCK_H]; + __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1]; + __shared__ real sh_dw[CONTEXT][BLOCK_W]; + + if (tidy < context) { + sh_dw[tidy][tidx] = 0.0; + } + __syncthreads(); + + // NOTE(zcd): temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; + for (int j = tidy; j < size; j += BLOCK_H) { + int xoff = gidx + tidx; + int yoff = start + j; + + // transpose + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = + (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; + __syncthreads(); + if (tidy < (context - 1)) { + yoff = yoff - context + 1; + sh_dy[tidx][tidy] = + (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; + } + __syncthreads(); + + for (int t = 0; t < context; t++) { + real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t]; + __syncthreads(); + // warp size and blockDim.x is 32. + + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down_sync(mask, val, offset); + + __syncthreads(); + if (tidx == 0) { + sh_dw[t][tidy] += val; + } + __syncthreads(); + } + } + } + + for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) { + dw[t * width + gidx + tidx] += sh_dw[t][tidx]; + } +} + +template +__global__ void KeRowConvBwWeight2(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sh_x[BLOCK_H][BLOCK_W]; + __shared__ real sh_dy[BLOCK_H][BLOCK_W]; + + // NOTE(zcd): temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; + for (int j = tidy; j < size; j += BLOCK_H) { + int xoff = gidx + tidx; + int yoff = start + j; + + // transpose + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? 
x[yoff * width + xoff] : 0.0; + __syncthreads(); + + for (int t = 0; t < context; t++) { + sh_dy[tidx][tidy] = + (xoff < width && (yoff - t) >= start && yoff - t < end) + ? dy[(yoff - t) * width + xoff] + : 0.0; + __syncthreads(); + + real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; + __syncthreads(); + // warp size and blockDim.x is 32. + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down_sync(mask, val, offset); + + __syncthreads(); + + if (tidx == 0 && (gidx + tidy) < width) { + dw[t * width + gidx + tidy] += val; + } + } + } + } +} + +template +__global__ void KeRowConvBwData(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sw[BLOCK_H][BLOCK_W]; + + for (int i = tidy; i < context; i += blky) { + sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0; + } + + __syncthreads(); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + real sum = 0; + int off = (start + j) * width; + for (int t = 0; t < context && (j - t) >= 0; ++t) { + int dyOff = off - t * width; + real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0; + sum += sw[t][tidx] * dyVal; + } + if (gidx + tidx < width) { + dx[off + gidx + tidx] += sum; + } + } + } +} + +__global__ void KeRowConvBwData2(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + real sum = 0; + int off = (start + j) * width; + for (int t = 0; t < context && (j - t) >= 0; ++t) { + int dyOff = off - t * width; + real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0; + real wVal = gidx + tidx < width ? 
w[t * width + gidx + tidx] : 0.0; + sum += wVal * dyVal; + } + if (gidx + tidx < width) { + dx[off + gidx + tidx] += sum; + } + } + } +} + +template <> +void RowConvGrad(const GpuMatrix& outG, + const GpuMatrix& in, + const GpuMatrix& filter, + GpuMatrix& inG, // NOLINT + GpuMatrix& filterG, // NOLINT + const GpuIVector& seq) { + const size_t numSeq = seq.getSize() - 1; + const size_t contextLength = filter.getHeight(); + const size_t height = in.getHeight(); + const size_t width = in.getWidth(); + + const real* dy = outG.getData(); + const real* x = in.getData(); + const real* w = filter.getData(); + const int* starts = seq.getData(); + + if (filterG) { + dim3 dimBlock(32, 32); + dim3 dimGrid(DIVUP(width, dimBlock.x), 1); + real* dw = filterG.getData(); + if (contextLength <= 32) { + KeRowConvBwWeight<32, 32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); + } else { + KeRowConvBwWeight2<32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); + } + } + + if (inG) { + real* dx = inG.getData(); + dim3 dimBlock2(32, 32); + dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1); + if (contextLength <= 64) { + KeRowConvBwData<32, 64><<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); + } else { + KeRowConvBwData2<<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); + } + } + + CHECK_SYNC("RowConvGrad"); +} + +} // namespace paddle diff --git a/paddle/function/RowConvOpTest.cpp b/paddle/legacy/function/RowConvOpTest.cpp similarity index 100% rename from paddle/function/RowConvOpTest.cpp rename to paddle/legacy/function/RowConvOpTest.cpp diff --git a/paddle/legacy/function/ScaleSubRegionOp.cpp b/paddle/legacy/function/ScaleSubRegionOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..03a422a740dca4499532cdb1bdfbf3d3ab272a9a --- /dev/null +++ b/paddle/legacy/function/ScaleSubRegionOp.cpp @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ScaleSubRegionOp.h" +#include "paddle/legacy/function/TensorShape.h" + +namespace paddle { + +template <> +void ScaleSubRegion(real* outputs, + const real* inputs, + const real* indices, + const TensorShape shape, + const FuncConfig& conf) { + real value = conf.get("value"); + + int number = shape[0]; + int channel = shape[1]; + int height = shape[2]; + int width = shape[3]; + + memcpy(outputs, inputs, number * channel * height * width * sizeof(real)); + + for (int n = 0; n < number; ++n) { + // indices start from 1 + int offset = n * 6; + for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) { + for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) { + for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) { + int idx = ((n * channel + c) * height + h) * width + w; + outputs[idx] *= value; + } + } + } + } +} + +template <> +void ScaleSubRegionGrad(const real* inGrad, + real* outGrad, + const real* indices, + const TensorShape shape, + const FuncConfig& conf) { + real value = conf.get("value"); + + int number = shape[0]; + int channel = shape[1]; + int height = shape[2]; + int width = shape[3]; + + for (int n = 0; n < number; ++n) { + for (int c = 0; c < channel; ++c) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + int idx = ((n * channel + c) * height + h) * width + w; + int offset = n * 6; + if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) && + h >= (indices[offset + 2] - 1) && + h <= (indices[offset + 3] - 1) && + w >= (indices[offset + 4] - 1) && + w <= (indices[offset + 5] - 1)) { + outGrad[idx] += inGrad[idx] * value; + } else { + outGrad[idx] += inGrad[idx]; + } + } + } + } + } +} + +/** + * \brief For each instance, ScaleSubRegion can be used to multiply a value to + * a specified sub continuous region. By providing start index and end + * index for C/H/W, you can specify the location and shape of the region. + * + * Argument in this Function: + * \param inputs A 4-D tensor with shape [N, C, H, W], only one input. + * \param indices A 2-D tensor with shape [N, 6], indicates the sub region. + * \param outputs A 4-D tensor with same shape as inputs, output value. + */ +template +class ScaleSubRegionFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { conf_ = config; } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(2UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + + TensorShape shape = inputs[0].shape(); + + ScaleSubRegion(outputs[0].data(), + inputs[0].data(), + inputs[1].data(), + shape, + conf_); + } + + private: + FuncConfig conf_; +}; + +/** + * \brief The backward propagation of ScaleSubRegion Function. + * + * Argument in this Function: + * \param inputs A 4-D tensor with shape [N, C, H, W], output gradient. + * \param indices A 2-D tensor with shape [N, 6], indicates the sub region. + * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value. 
+ */
+
+template <DeviceType Device>
+class ScaleSubRegionGradFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
+                               outputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               shape,
+                               conf_);
+  }
+
+ private:
+  FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOp.h b/paddle/legacy/function/ScaleSubRegionOp.h
similarity index 100%
rename from paddle/function/ScaleSubRegionOp.h
rename to paddle/legacy/function/ScaleSubRegionOp.h
diff --git a/paddle/function/ScaleSubRegionOpGpu.cu b/paddle/legacy/function/ScaleSubRegionOpGpu.cu
similarity index 100%
rename from paddle/function/ScaleSubRegionOpGpu.cu
rename to paddle/legacy/function/ScaleSubRegionOpGpu.cu
diff --git a/paddle/function/ScaleSubRegionOpTest.cpp b/paddle/legacy/function/ScaleSubRegionOpTest.cpp
similarity index 100%
rename from paddle/function/ScaleSubRegionOpTest.cpp
rename to paddle/legacy/function/ScaleSubRegionOpTest.cpp
diff --git a/paddle/legacy/function/SwitchOp.cpp b/paddle/legacy/function/SwitchOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6accd18039180aa521c18193e576d22e11f5a97
--- /dev/null
+++ b/paddle/legacy/function/SwitchOp.cpp
@@ -0,0 +1,140 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SwitchOp.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void NCHW2NHWC<DEVICE_TYPE_CPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inC,
+                                const int inH,
+                                const int inW,
+                                const int argType) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < inC; ++c) {
+      for (int h = 0; h < inH; ++h) {
+        for (int w = 0; w < inW; ++w) {
+          if (argType == ADD_TO) {
+            outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++);
+          } else {
+            outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <>
+void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inH,
+                                const int inW,
+                                const int inC,
+                                const int argType) {
+  for (int n = 0; n < num; ++n) {
+    for (int h = 0; h < inH; ++h) {
+      for (int w = 0; w < inW; ++w) {
+        for (int c = 0; c < inC; ++c) {
+          if (argType == ADD_TO) {
+            outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++);
+          } else {
+            outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief Switch the dimension order of the image input.
+ *        The input and output are 4D tensors. This switches the order
+ *        'batch_size, channels, height, width' to the order
+ *        'batch_size, height, width, channels'.
+ *
+ * Argument in this Function:
+ * \param inputs  input data with order 'batch_size, channels, height, width'.
+ * \param outputs output data with order 'batch_size, height, width, channels'.
+ */
+template <DeviceType Device>
+class NCHW2NHWCFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+
+    size_t num = inputs[0].shape()[0];
+    size_t inC = inputs[0].shape()[1];
+    size_t inH = inputs[0].shape()[2];
+    size_t inW = inputs[0].shape()[3];
+    NCHW2NHWC<Device>(outputs[0].data<real>(),
+                      inputs[0].data<real>(),
+                      num,
+                      inC,
+                      inH,
+                      inW,
+                      outputs[0].getArgType());
+  }
+};
+
+/**
+ * \brief Switch the dimension order of the image input.
+ *        The input and output are 4D tensors. This switches the order
+ *        'batch_size, height, width, channels' to the order
+ *        'batch_size, channels, height, width'.
+ *
+ * Argument in this Function:
+ * \param inputs  input data with order 'batch_size, height, width, channels'.
+ * \param outputs output data with order 'batch_size, channels, height, width'.
+ */
+template <DeviceType Device>
+class NHWC2NCHWFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+
+    size_t num = inputs[0].shape()[0];
+    size_t inH = inputs[0].shape()[1];
+    size_t inW = inputs[0].shape()[2];
+    size_t inC = inputs[0].shape()[3];
+
+    NHWC2NCHW<Device>(outputs[0].data<real>(),
+                      inputs[0].data<real>(),
+                      num,
+                      inH,
+                      inW,
+                      inC,
+                      outputs[0].getArgType());
+  }
+};
+
+REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
+REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
+REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/SwitchOp.h b/paddle/legacy/function/SwitchOp.h
similarity index 100%
rename from paddle/function/SwitchOp.h
rename to paddle/legacy/function/SwitchOp.h
diff --git a/paddle/function/SwitchOpGpu.cu b/paddle/legacy/function/SwitchOpGpu.cu
similarity index 100%
rename from paddle/function/SwitchOpGpu.cu
rename to paddle/legacy/function/SwitchOpGpu.cu
diff --git a/paddle/function/SwitchOpTest.cpp b/paddle/legacy/function/SwitchOpTest.cpp
similarity index 100%
rename from paddle/function/SwitchOpTest.cpp
rename to paddle/legacy/function/SwitchOpTest.cpp
diff --git a/paddle/function/TensorShape.h b/paddle/legacy/function/TensorShape.h
similarity index 100%
rename from paddle/function/TensorShape.h
rename to paddle/legacy/function/TensorShape.h
diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/legacy/function/TensorShapeTest.cpp
similarity index 100%
rename from paddle/function/TensorShapeTest.cpp
rename to paddle/legacy/function/TensorShapeTest.cpp
diff --git a/paddle/legacy/function/TensorType.h b/paddle/legacy/function/TensorType.h
new file mode 100644
index 0000000000000000000000000000000000000000..13994821be7ba7264f43d8550e6800cdc5b93875
--- /dev/null
+++ b/paddle/legacy/function/TensorType.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+enum ValueType {
+  VALUE_TYPE_INT32 = 0,
+  VALUE_TYPE_FLOAT = 1,
+  VALUE_TYPE_DOUBLE = 2,
+  VALUE_TYPE_BYTE = 3
+};
+
+enum DeviceType {
+  DEVICE_TYPE_UNSPECIFIED = 0,
+  DEVICE_TYPE_CPU = 1,
+  DEVICE_TYPE_GPU = 2
+};
+
+enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 };
+
+enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 };
+
+inline int sizeOfValuType(ValueType valueType) {
+  if (valueType == VALUE_TYPE_INT32) {
+    return 4;
+  } else if (valueType == VALUE_TYPE_FLOAT) {
+    return 4;
+  } else if (valueType == VALUE_TYPE_DOUBLE) {
+    return 8;
+  } else {
+    LOG(FATAL) << "Unknown type: " << valueType;
+    return 0;
+  }
+}
+
+template <typename T>
+struct DataType;
+
+template <>
+struct DataType<float> {
+  static const ValueType value = VALUE_TYPE_FLOAT;
+};
+
+template <>
+struct DataType<double> {
+  static const ValueType value = VALUE_TYPE_DOUBLE;
+};
+
+template <>
+struct DataType<int> {
+  static const ValueType value = VALUE_TYPE_INT32;
+};
+
+namespace detail {
+
+template <DeviceType Device, typename T>
+struct MatrixT;
+
+template <>
+struct MatrixT<DEVICE_TYPE_CPU, real> {
+  using type = CpuMatrix;
+};
+
+template <>
+struct MatrixT<DEVICE_TYPE_GPU, real> {
+  using type = GpuMatrix;
+};
+
+template <>
+struct MatrixT<DEVICE_TYPE_CPU, int> {
+  using type = void;  // Not implemented
+};
+
+template <>
+struct MatrixT<DEVICE_TYPE_GPU, int> {
+  using type = void;  // Not implemented
+};
+
+template <DeviceType Device, typename T>
+struct SparseMatrixT;
+
+template <>
+struct SparseMatrixT<DEVICE_TYPE_CPU, real> {
+  using type = CpuSparseMatrix;
+};
+
+template <>
+struct SparseMatrixT<DEVICE_TYPE_GPU, real> {
+  using type = GpuSparseMatrix;
+};
+
+template <>
+struct SparseMatrixT<DEVICE_TYPE_CPU, int> {
+  using type = void;  // Not implemented
+};
+
+template <>
+struct SparseMatrixT<DEVICE_TYPE_GPU, int> {
+  using type = void;  // Not implemented
+};
+
+template <DeviceType Device, typename T>
+struct VectorT;
+
+template <>
+struct VectorT<DEVICE_TYPE_CPU, real> {
+  using type = CpuVector;
+};
+
+template <>
+struct VectorT<DEVICE_TYPE_GPU, real> {
+  using type = GpuVector;
+};
+
+template <>
+struct VectorT<DEVICE_TYPE_CPU, int> {
+  using type = CpuIVector;
+};
+
+template <>
+struct VectorT<DEVICE_TYPE_GPU, int> {
+  using type = GpuIVector;
+};
+
+}  // namespace detail
+
+template <typename VType, DeviceType DType>
+struct Tensor {
+  typedef typename detail::VectorT<DType, VType>::type Vector;
+  typedef typename detail::MatrixT<DType, VType>::type Matrix;
+  typedef typename detail::SparseMatrixT<DType, VType>::type SparseMatrix;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/legacy/function/TensorTypeTest.cpp
similarity index 100%
rename from paddle/function/TensorTypeTest.cpp
rename to paddle/legacy/function/TensorTypeTest.cpp
diff --git a/paddle/legacy/function/neon/NeonDepthwiseConv.cpp b/paddle/legacy/function/neon/NeonDepthwiseConv.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6179635a9fec4afecf53fabdc6a818588b54c808
--- /dev/null
+++ b/paddle/legacy/function/neon/NeonDepthwiseConv.cpp
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "NeonDepthwiseConv.h" +#include "paddle/legacy/function/ConvOp.h" + +namespace paddle { + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +template +class NeonDepthwiseConvFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + int batchSize = input[0]; + int inputChannels = input[1]; + int inputHeight = input[2]; + int inputWidth = input[3]; + int filterHeight = getFilterHeight(filter); + int filterWidth = getFilterWidth(filter); + int outputChannels = output[1]; + int outputHeight = output[2]; + int outputWidth = output[3]; + int filterMultiplier = outputChannels / groups_; + CHECK_EQ(static_cast(inputChannels), groups_); + + // only support strideH() == strideW() and filterHeight == filterWidth. 
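+    // Note: the kernel dispatch below is specialized per (filter size,
+    // stride) pair, e.g. DepthwiseConvKernel<3, 1>, which is why only square
+    // filters with equal strides along both axes are accepted here.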
+ CHECK_EQ(strideH(), strideW()); + CHECK_EQ(filterHeight, filterWidth); + + float* inputData = inputs[0].data(); + float* filterData = inputs[1].data(); + float* outputData = outputs[0].data(); + + // padding the input + float* inputPadding = inputData; + int padInputHeight = inputHeight + 2 * paddingH(); + int padInputWidth = inputWidth + 2 * paddingW(); + int newSize = + batchSize * (inputChannels + 1) * padInputHeight * padInputWidth; + + resizeBuffer(newSize); + inputPadding = reinterpret_cast(memory_->getBuf()); + neon::Padding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + + std::function + DepthWiseConv; + + if (filterWidth == 3 && strideW() == 1) { + DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; + } else if (filterWidth == 3 && strideW() == 2) { + DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run; + } else if (filterWidth == 4 && strideW() == 1) { + DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; + } else if (filterWidth == 4 && strideW() == 2) { + DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run; + } else { + LOG(FATAL) << "Not supported"; + } + + for (int i = 0; i < batchSize; i++) { + DepthWiseConv(inputPadding, + filterData, + padInputHeight, + padInputWidth, + outputChannels, + outputHeight, + outputWidth, + filterMultiplier, + outputData); + inputPadding += inputChannels * padInputHeight * padInputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } +}; + +#ifndef PADDLE_TYPE_DOUBLE +REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction); +#endif + +#endif + +} // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConv.h b/paddle/legacy/function/neon/NeonDepthwiseConv.h similarity index 100% rename from paddle/function/neon/NeonDepthwiseConv.h rename to paddle/legacy/function/neon/NeonDepthwiseConv.h diff --git a/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp new file mode 100644 index 0000000000000000000000000000000000000000..feb77e1ff9f591d63dbf86a05313d65025f7c65d --- /dev/null +++ b/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp @@ -0,0 +1,136 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "NeonDepthwiseConv.h" +#include "paddle/legacy/function/ConvOp.h" + +namespace paddle { + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +template +class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + int batchSize = input[0]; + int inputChannels = input[1]; + int inputHeight = input[2]; + int inputWidth = input[3]; + int filterHeight = getFilterHeight(filter); + int filterWidth = getFilterWidth(filter); + int outputChannels = output[1]; + int outputHeight = output[2]; + int outputWidth = output[3]; + int filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); + + // only support strideH() == strideW() and filterHeight == filterWidth. + CHECK_EQ(strideH(), strideW()); + CHECK_EQ(paddingH(), paddingW()); + CHECK_EQ(filterHeight, filterWidth); + + float* inputData = inputs[0].data(); + float* filterData = inputs[1].data(); + float* outputData = outputs[0].data(); + + // padding the input, input -> inputPadding + float* inputPadding = inputData; + int padInputHeight = + (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH(); + int padInputWidth = + (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW(); + + if (padInputHeight > inputHeight || padInputWidth > inputWidth) { + int newSize = batchSize * inputChannels * padInputHeight * padInputWidth; + resizeBuffer(newSize); + inputPadding = reinterpret_cast(memory_->getBuf()); + if (strideH() == 1) { + neon::Padding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + } else if (strideH() == 2) { + neon::StridePadding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + } else { + LOG(FATAL) << "Not supported"; + } + } + + std::function + DepthWiseConv; + + if (filterWidth == 3) { + DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; + } else if (filterWidth == 4) { + DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; + } else { + LOG(FATAL) << "Not supported"; + } + + for (int i = 0; i < batchSize; i++) { + DepthWiseConv(inputPadding, + filterData, + padInputHeight, + padInputWidth, + outputChannels, + outputHeight, + outputWidth, + filterMultiplier, + outputData); + inputPadding += inputChannels * padInputHeight * padInputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } +}; + +#ifndef PADDLE_TYPE_DOUBLE + +REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose, + CPU, + NeonDepthwiseConvTransposeFunction); + +#endif + +#endif + +} // namespace paddle diff --git a/paddle/function/neon/neon_util.h b/paddle/legacy/function/neon/neon_util.h similarity index 100% rename from paddle/function/neon/neon_util.h rename to paddle/legacy/function/neon/neon_util.h diff --git 
a/paddle/legacy/function/nnpack/NNPACKConvOp.cpp b/paddle/legacy/function/nnpack/NNPACKConvOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..81c832e7747f8e75d322891476e08dacc435f5d4 --- /dev/null +++ b/paddle/legacy/function/nnpack/NNPACKConvOp.cpp @@ -0,0 +1,247 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "nnpack.h" +#include "paddle/legacy/function/ConvOp.h" + +DEFINE_bool(nnpack_allocate_outside, + true, + "Allocate and free workspace memory outside the NNPACK interface."); +DEFINE_int32(nnpack_num_threads, + 0, + "The number of nnpack threads" + "default: 0; 0 to disable threadpool."); + +namespace paddle { + +nnp_convolution_algorithm get_nnp_convolution_algorithm( + const std::string& algorithm) { + if (algorithm == "auto") { + return nnp_convolution_algorithm_auto; + } else if (algorithm == "ft8x8") { + return nnp_convolution_algorithm_ft8x8; + } else if (algorithm == "ft16x16") { + return nnp_convolution_algorithm_ft16x16; + } else if (algorithm == "wt8x8") { + return nnp_convolution_algorithm_wt8x8; + } else if (algorithm == "implicit-gemm") { + return nnp_convolution_algorithm_implicit_gemm; + } else if (algorithm == "direct") { + return nnp_convolution_algorithm_direct; + } else { + return nnp_convolution_algorithm_auto; + } +} + +template +class NNPACKConvFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + algorithm_ = get_nnp_convolution_algorithm(config.get("algo")); + transform_strategy_ = nnp_convolution_transform_strategy_compute; + nnp_status status = nnp_initialize(); + CHECK_EQ(status, nnp_status_success); + workspaceBuffer_ = nullptr; + workspaceSize_ = 0; + + create_nnpack_threadpool(); + } + + ~NNPACKConvFunction() { + if (workspaceBuffer_) { + free(workspaceBuffer_); + } + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + check(inputs, outputs); + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + nnp_size inputSize = {.width = inputWidth, .height = inputHeight}; + nnp_padding padding = {.top = (size_t)paddingH(), + .right = (size_t)paddingW(), + .bottom = 
(size_t)paddingH(), + .left = (size_t)paddingW()}; + nnp_size kernelSize = {.width = filterWidth, .height = filterHeight}; + nnp_size outputSubsampling = {.width = (size_t)strideW(), + .height = (size_t)strideH()}; + + float* inputData = inputs[0].data(); + float* filterData = inputs[1].data(); + float* outputData = outputs[0].data(); + + void* bufferPtr = nullptr; + size_t* sizePtr = nullptr; + size_t needSize; + if (FLAGS_nnpack_allocate_outside) { + if (batchSize == 1) { + nnp_status status = nnp_convolution_inference(algorithm_, + transform_strategy_, + inputChannels, + outputChannels, + inputSize, + padding, + kernelSize, + outputSubsampling, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &needSize, + nnp_activation_identity, + nullptr, + nullptr, + nullptr); + CHECK_EQ(status, nnp_status_success); + } else { + // only supports stride = 1 + CHECK_EQ(strideH(), 1); + CHECK_EQ(strideW(), 1); + nnp_status status = nnp_convolution_output(algorithm_, + batchSize, + inputChannels, + outputChannels, + inputSize, + padding, + kernelSize, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &needSize, + nnp_activation_identity, + nullptr, + nullptr, + nullptr); + CHECK_EQ(status, nnp_status_success); + } + + VLOG(3) << "workspace size is " << needSize; + if (needSize > workspaceSize_) { + workspaceSize_ = needSize; + if (workspaceBuffer_) { + free(workspaceBuffer_); + } else { + posix_memalign(&workspaceBuffer_, 64, needSize); + } + } + + if (needSize) { + bufferPtr = workspaceBuffer_; + sizePtr = &needSize; + } + } + + size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth; + size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + + if (batchSize == 1) { + for (size_t g = 0; g < groups_; g++) { + nnp_status status = + nnp_convolution_inference(algorithm_, + transform_strategy_, + inputChannels / groups_, + outputChannels / groups_, + inputSize, + padding, + kernelSize, + outputSubsampling, + inputData + inputOffset * g, + filterData + filterOffset * g, + nullptr, /* bias */ + outputData + outputOffset * g, + bufferPtr, + sizePtr, + nnp_activation_identity, + nullptr, + threadpool_, /* threadpool */ + nullptr); + CHECK_EQ(status, nnp_status_success); + } + } else { + // only supports stride = 1 + CHECK_EQ(strideH(), 1); + CHECK_EQ(strideW(), 1); + + // TODO(hedaoyuan): There has some bug when batchSize > 1 and groups_ > 1. 
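+      // nnp_convolution_output processes the whole batch in a single call;
+      // given the TODO above, the batch path is restricted to one group.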
+ CHECK_EQ(groups_, static_cast(1)); + nnp_status status = nnp_convolution_output(algorithm_, + batchSize, + inputChannels, + outputChannels, + inputSize, + padding, + kernelSize, + inputData, + filterData, + nullptr, /* bias */ + outputData, + bufferPtr, + sizePtr, + nnp_activation_identity, + nullptr, + threadpool_, /* threadpool */ + nullptr); + CHECK_EQ(status, nnp_status_success); + } + } + + static void create_nnpack_threadpool() { + if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) { + threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); + VLOG(3) << "Number of threads " + << pthreadpool_get_threads_count(threadpool_); + } + } + + private: + nnp_convolution_algorithm algorithm_; + nnp_convolution_transform_strategy transform_strategy_; + void* workspaceBuffer_; + size_t workspaceSize_; + static pthreadpool_t threadpool_; +}; + +template +pthreadpool_t NNPACKConvFunction::threadpool_ = nullptr; + +REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction); + +} // namespace paddle diff --git a/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp b/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a2db83f5a36310ca6f173d6e6501118b34060761 --- /dev/null +++ b/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/legacy/function/ConvOpTest.h" + +namespace paddle { + +TEST(NNPACK, Forward) { + Convolution( + "GemmConv-CPU", "NNPACKConv-CPU", forward); +} + +TEST(NNPACK, Depthwise) { + DepthwiseConvolution( + "GemmConv-CPU", "NNPACKConv-CPU", forward); +} + +} // namespace paddle diff --git a/paddle/gserver/CMakeLists.txt b/paddle/legacy/gserver/CMakeLists.txt similarity index 100% rename from paddle/gserver/CMakeLists.txt rename to paddle/legacy/gserver/CMakeLists.txt diff --git a/paddle/legacy/gserver/activations/ActivationFunction.cpp b/paddle/legacy/gserver/activations/ActivationFunction.cpp new file mode 100644 index 0000000000000000000000000000000000000000..69f34db5ac193664e13846835ffb4bd7f579e028 --- /dev/null +++ b/paddle/legacy/gserver/activations/ActivationFunction.cpp @@ -0,0 +1,509 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ActivationFunction.h" + +#include +#include +#include +#include +#include +#include +#include "paddle/legacy/parameter/Argument.h" +#include "paddle/utils/ClassRegistrar.h" +#include "paddle/utils/Logging.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "MKLDNNActivation.h" +#endif + +namespace paddle { + +static ClassRegistrar gActivationRegistrar; +/** + * @def ACTIVATION_CLASS_NAME + * @brief Macro for getting derived activation class name + * @note ACTIVATION_CLASS_NAME(softmax) softmax_; + * means softmaxActivation softmax_; + */ +#define ACTIVATION_CLASS_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Activation +/** + * @def BEGIN_DEFINE_ACTIVATION + * @brief Macro for defining a devried activation class + */ +#define BEGIN_DEFINE_ACTIVATION(ACTIVATION_NAME) \ + class ACTIVATION_CLASS_NAME(ACTIVATION_NAME) : public ActivationFunction { \ + private: \ + static const std::string name; \ + \ + public: \ + const std::string& getName() const { return name; } +/** + * @def END_DEFINE_ACTIVATION + * @brief Macro for registering a derived activation class + */ +#define END_DEFINE_ACTIVATION(ACTIVATION_NAME) \ + } \ + ; \ + const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \ + #ACTIVATION_NAME; \ + static InitFunction __reg_activation__##ACTIVATION_NAME([] { \ + gActivationRegistrar \ + .registerClass( \ + #ACTIVATION_NAME); \ + }); + +/** + * @brief The IdentityActivation class + * + * Do nothing when forward/backward. + */ +class IdentityActivation : public ActivationFunction { + public: + static const std::string name; + Error __must_check forward(Argument& act) { + (void)act; + return Error(); + } + Error __must_check backward(Argument& act) { + (void)act; + return Error(); + } + const std::string& getName() const { return name; } +}; +const std::string IdentityActivation::name = ""; +static InitFunction __reg_activation__identity([] { + gActivationRegistrar.registerClass(""); + gActivationRegistrar.registerClass("linear"); +}); + +/** + * @brief Sigmoid Activation + * \f[ + * f(z) = \frac{1}{1+exp(-z)} + * \f] + */ +BEGIN_DEFINE_ACTIVATION(sigmoid) +Error __must_check forward(Argument& act) { + act.value->sigmoid(*act.value); + return Error(); +} +Error __must_check backward(Argument& act) { + act.grad->sigmoidDerivative(*act.value); + return Error(); +} +END_DEFINE_ACTIVATION(sigmoid) + +/** + * @brief Softmax Activation + * \f[ + * P(y=j|x) = \frac{e^{x^Tw_j}}{\sum^K_{k=1}e^{x^Tw_k}} + * \f] + */ +BEGIN_DEFINE_ACTIVATION(softmax) +private: +MatrixPtr sftMaxSum_; +MatrixPtr sftMaxDot_; + +public: +Error __must_check forward(Argument& act) { + act.value->softmax(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + MatrixPtr outputV = act.value; + MatrixPtr outputG = act.grad; + + if (outputG->useGpu()) { + outputG->softmaxBackward(*outputV); + } else { + SetDevice device(act.deviceId); + Matrix::resizeOrCreate(sftMaxDot_, + outputG->getHeight(), + outputG->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); + Matrix::resizeOrCreate(sftMaxSum_, + outputG->getHeight(), + 1, + /* trans */ false, + useGpu(act.deviceId)); + + sftMaxDot_->dotMul(*outputG, *outputV); + sftMaxSum_->colMerge(*sftMaxDot_); + + act.grad->softmaxDerivative(*act.value, *sftMaxSum_); + } + return Error(); +} +END_DEFINE_ACTIVATION(softmax) + +/** + * @brief Sequence_softmax Activation + * @note Softmax on all frames of one sequence. + * Width of frame must be one. 
+ */
+BEGIN_DEFINE_ACTIVATION(sequence_softmax)
+private:
+ACTIVATION_CLASS_NAME(softmax) softmax_;
+Argument argument_;
+
+public:
+Error __must_check forward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Error(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
+
+  if (!argument_.value) {
+    argument_.value = Matrix::create(nullptr,
+                                     /* height= */ 1,
+                                     1,
+                                     /* trans= */ false,
+                                     useGpu(act.deviceId));
+    argument_.grad = Matrix::create(nullptr,
+                                    /* height= */ 1,
+                                    1,
+                                    /* trans= */ false,
+                                    useGpu(act.deviceId));
+  }
+
+  auto starts =
+      act.hasSubseq()
+          ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId))
+          : act.sequenceStartPositions->getVector(useGpu(act.deviceId));
+  act.value->sequenceSoftmax(*act.value, *starts);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Error(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
+
+  size_t numSequences =
+      act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences();
+  const int* starts = act.getCpuStartPositions();
+
+  for (size_t i = 0; i < numSequences; ++i) {
+    // TODO(Dangqingqing) optimization for GPU
+    size_t offset = starts[i];
+    size_t size = starts[i + 1] - starts[i];
+    argument_.value->setData(act.value->getData() + offset, 1UL, size);
+    argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
+
+    Error err = softmax_.backward(argument_);
+    if (!err.isOK()) return err;
+  }
+  return Error();
+}
+END_DEFINE_ACTIVATION(sequence_softmax)
+
+/*
+ * @brief SoftSign Activation.
+ * \f[
+ * f(z) = \frac{z}{1 + |z|}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(softsign)
+private:
+MatrixPtr denominator_;
+
+Error __must_check forward(Argument& act) {
+  size_t height = act.value->getHeight();
+  size_t width = act.value->getWidth();
+  Matrix::resizeOrCreate(
+      denominator_, height, width, false, useGpu(act.deviceId));
+  denominator_->assign(*act.value);
+  denominator_->abs2();
+  denominator_->add(1.);
+
+  act.value->dotDiv(*act.value, *denominator_);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  denominator_->square2();
+  denominator_->scalarDiv(*denominator_, 1.);
+  act.grad->dotMul(*act.grad, *denominator_);
+  return Error();
+}
+END_DEFINE_ACTIVATION(softsign)
+
+/**
+ * @brief Relu Activation.
+ * forward: y = max(0, z)
+ *
+ * The derivative of relu is:
+ *
+ *    1 if z > 0
+ *
+ *    0 otherwise.
+ */
+BEGIN_DEFINE_ACTIVATION(relu)
+Error __must_check forward(Argument& act) {
+  act.value->relu(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->reluDerivative(*act.value);
+  return Error();
+}
+END_DEFINE_ACTIVATION(relu)
+
+/**
+ * @brief BRelu Activation.
+ *
+ * forward: y = min(24, max(0, z))
+ *
+ * The derivative of brelu is:
+ *
+ *    1 if 0 < z < 24
+ *
+ *    0 otherwise.
+ *
+ * TODO(yuyang18): Remove the magic number 24 or make it configurable.
+ */
+BEGIN_DEFINE_ACTIVATION(brelu)
+Error __must_check forward(Argument& act) {
+  act.value->brelu(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->breluDerivative(*act.value);
+  return Error();
+}
+END_DEFINE_ACTIVATION(brelu)
+
+/**
+ * @brief Tanh Activation.
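+ *        The backward pass uses d/dz tanh(z) = 1 - tanh(z)^2, which is why
+ *        it can be computed from the saved output value alone.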
+ * \f[ + * f(z) = tanh(z)=\frac{e^z-e^{-z}}{e^z+e^{-z}} + * \f] + */ +BEGIN_DEFINE_ACTIVATION(tanh) +Error __must_check forward(Argument& act) { + act.value->tanh(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->tanhDerivative(*act.value); + return Error(); +} +END_DEFINE_ACTIVATION(tanh) + +/** + * @brief Scaled Tanh Activation + * \f[ + * f(z) = 1.7159 * tanh(2/3*z) + * \f] + */ +BEGIN_DEFINE_ACTIVATION(stanh) +private: +real a, b; + +public: +ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {} +Error __must_check forward(Argument& act) { + act.value->scaledTanh(*act.value, a, b); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->scaledTanhDerivative(*act.value, a, b); + return Error(); +} +END_DEFINE_ACTIVATION(stanh) + +/** + * @brief Soft Relu Activation. + * \f[ + * f(z) = ln(1+e^z) + * \f] + */ +BEGIN_DEFINE_ACTIVATION(softrelu) +Error __must_check forward(Argument& act) { + act.value->softrelu(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->softreluDerivative(*act.value); + return Error(); +} +END_DEFINE_ACTIVATION(softrelu) + +/** + * @brief Abs Activation. + * Forward: f(z) = abs(z) + * + * Derivative: + * + * 1 if z>0 + * + * -1 if z<0 + * + * 0 if z=0 + */ +BEGIN_DEFINE_ACTIVATION(abs) +Error __must_check forward(Argument& act) { + SetDevice device(act.deviceId); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); + + act.in->copyFrom(*act.value); + act.value->abs2(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->absDerivative(*act.in); + return Error(); +} +END_DEFINE_ACTIVATION(abs) + +/** + * @brief Square Activation. + * \f[ + * f(z) = z^2. + * \f] + */ +BEGIN_DEFINE_ACTIVATION(square) +Error __must_check forward(Argument& act) { + SetDevice device(act.deviceId); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); + + act.in->copyFrom(*act.value); + act.value->square2(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->squareDerivative(*act.in); + return Error(); +} +END_DEFINE_ACTIVATION(square) + +/** + * @brief Exponential Activation. + * \f[ + * f(z) = e^z + * \f] + */ +BEGIN_DEFINE_ACTIVATION(exponential) +Error __must_check forward(Argument& act) { + act.value->exp2(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->expDerivative(*act.value); + return Error(); +} +END_DEFINE_ACTIVATION(exponential) + +/** + * @brief Reciprocal Activation. + * \f[ + * f(z) = 1/z + * \f] + */ +BEGIN_DEFINE_ACTIVATION(reciprocal) +Error __must_check forward(Argument& act) { + act.value->reciprocal2(); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->dotMulSquare(*act.value); + act.grad->neg(); + return Error(); +} +END_DEFINE_ACTIVATION(reciprocal) + +/** + * @brief Square Root Activation. + * \f[ + * f(z) = sqrt(z) + * \f] + */ +BEGIN_DEFINE_ACTIVATION(sqrt) +Error __must_check forward(Argument& act) { + act.value->sqrt2(); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->dotDiv(*act.grad, *act.value); + act.grad->mulScalar(0.5); + return Error(); +} +END_DEFINE_ACTIVATION(sqrt) + +/** + * @brief Logarithm Activation. 
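+ *        Since d/dz log(z) = 1/z, forward() first snapshots the input into
+ *        act.in, and backward() divides the output gradient by that saved
+ *        input.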
+ * \f[ + * f(z) = log(z) + * \f] + */ +BEGIN_DEFINE_ACTIVATION(log) +Error __must_check forward(Argument& act) { + SetDevice device(act.deviceId); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); + + act.in->copyFrom(*act.value); + act.value->log2(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->dotDiv(*act.grad, *act.in); + return Error(); +} +END_DEFINE_ACTIVATION(log) + +ActivationFunction* ActivationFunction::create(const std::string& type) { +#ifdef PADDLE_WITH_MKLDNN + if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) { + return MKLDNNActivation::create(type); + } +#endif + + return gActivationRegistrar.createByType(type); +} + +std::vector ActivationFunction::getAllRegisteredTypes() { + std::vector types; + gActivationRegistrar.forEachType( + [&](const std::string& type) { types.push_back(type); }); + return types; +} + +} // namespace paddle diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/legacy/gserver/activations/ActivationFunction.h similarity index 100% rename from paddle/gserver/activations/ActivationFunction.h rename to paddle/legacy/gserver/activations/ActivationFunction.h diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/legacy/gserver/activations/MKLDNNActivation.cpp similarity index 100% rename from paddle/gserver/activations/MKLDNNActivation.cpp rename to paddle/legacy/gserver/activations/MKLDNNActivation.cpp diff --git a/paddle/legacy/gserver/activations/MKLDNNActivation.h b/paddle/legacy/gserver/activations/MKLDNNActivation.h new file mode 100644 index 0000000000000000000000000000000000000000..59c447ad07398c0b6ca7d78766dd533963744d1b --- /dev/null +++ b/paddle/legacy/gserver/activations/MKLDNNActivation.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "ActivationFunction.h" +#include "mkldnn.hpp" +#include "paddle/legacy/gserver/layers/MKLDNNBase.h" +#include "paddle/legacy/math/MKLDNNMatrix.h" +#include "paddle/legacy/parameter/Argument.h" + +namespace paddle { + +/** + * @brief Base class of MKLDNN Activation. + * Common activation function are provieded, + * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax + */ +class MKLDNNActivation : public ActivationFunction { + protected: + // input value element count + size_t cnt_; + // should not merge the resetBwd into resetFwd, + // because the grad data would be changing before backward. 
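+  // (Annotation added for clarity, not in the original source.)
+  // Deferring resetBwd() until backward() runs is what needResetBwd_
+  // tracks below, while cnt_ above caches the input element count so the
+  // MKLDNN primitives are rebuilt only when the input shape changes.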
+ bool needResetBwd_; + // mkldnn matrix, primitive, stream and pipeline + MKLDNNMatrixPtr val_; + MKLDNNMatrixPtr grad_; + std::shared_ptr engine_; + std::shared_ptr stream_; + std::shared_ptr fwd_; + std::shared_ptr bwd_; + std::vector pipelineFwd_; + std::vector pipelineBwd_; + + public: + MKLDNNActivation() : cnt_(0), needResetBwd_(true) {} + ~MKLDNNActivation() {} + static ActivationFunction* create(const std::string& type); + static std::vector getAllRegisteredTypes(); + virtual const std::string& getName() const = 0; + /** + * reset the forward primitives + */ + virtual void resetFwd(Argument& act); + /** + * reset the backward primitives, + * can not merge this functions into resetFwd as the grad data + * would be changing before backward. + */ + virtual void resetBwd(Argument& act) {} + virtual Error __must_check forward(Argument& act); + virtual Error __must_check backward(Argument& act); +}; + +/** + * @brief Base class of MKLDNN Eltwise Activation, + * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh. + */ +class MKLDNNEltwiseActivation : public MKLDNNActivation { + typedef mkldnn::eltwise_forward eltwise_fwd; + typedef mkldnn::eltwise_backward eltwise_bwd; + typedef mkldnn::algorithm algorithm; + + protected: + // save the forward primitive desc, which can be used backward + std::shared_ptr fwdPD_; + // eltwise_bwd need src input value + MKLDNNMatrixPtr inVal_; + // use for copy data + std::shared_ptr copyInVal_; + + public: + MKLDNNEltwiseActivation() {} + ~MKLDNNEltwiseActivation() {} + virtual const std::string& getName() const = 0; + + // in common, the alpha of forward and backward should be equal. + // but for relu, to avoid negative value, they should be opposite + virtual float getAlpha() const = 0; + virtual float getBwdAlpha() const = 0; + virtual float getBeta() const { return 0.f; } + virtual algorithm getAlgo(std::string type) const; + void resetFwd(Argument& act) override; + void resetBwd(Argument& act) override; +}; + +/** + * @brief Base class of MKLDNN softmax Activation, + * only have mkldnn forward, use cpu implement for backward. + */ +class MKLDNNSoftmaxActivation : public MKLDNNActivation { + typedef mkldnn::softmax_forward softmax_fwd; + + private: + // for backward + MatrixPtr sftMaxSum_; + MatrixPtr sftMaxDot_; + + public: + MKLDNNSoftmaxActivation() {} + ~MKLDNNSoftmaxActivation() {} + virtual const std::string& getName() const = 0; + void resetFwd(Argument& act) override; + Error __must_check forward(Argument& act) override; + Error __must_check backward(Argument& act) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/legacy/gserver/dataproviders/DataProvider.cpp similarity index 100% rename from paddle/gserver/dataproviders/DataProvider.cpp rename to paddle/legacy/gserver/dataproviders/DataProvider.cpp diff --git a/paddle/legacy/gserver/dataproviders/DataProvider.h b/paddle/legacy/gserver/dataproviders/DataProvider.h new file mode 100644 index 0000000000000000000000000000000000000000..b6f74afed05e0b42b3a4ec26041bcb8fa50fa9b2 --- /dev/null +++ b/paddle/legacy/gserver/dataproviders/DataProvider.h @@ -0,0 +1,480 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DataConfig.pb.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Argument.h" +#include "paddle/utils/ClassRegistrar.h" +#include "paddle/utils/Common.h" +#include "paddle/utils/Locks.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Queue.h" +#include "paddle/utils/ThreadLocal.h" +#include "paddle/utils/Util.h" + +namespace paddle { +/** + * @def REGISTER_DATA_PROVIDER + * @brief Macro for registering a data provider. The class type should contain + * a consturctor with parameter (DataConfig, bool). + */ +#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([]() { \ + DataProvider::registrar_.registerClass( \ + #__type_name, \ + [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \ + DataProvider* dp = new __class_name(conf, useGpu); \ + return dp; \ + }); \ + }) + +/** + * @def REGISTER_DATA_PROVIDER_EX + * @brief Macro for registering a data provider, which contains a constructor + * with parameter (DataConfig, ModelConfig, bool). + */ +#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([] { \ + DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ + }) + +class DataBatch; +class BufferBatch; +typedef std::shared_ptr DataBatchPtr; +typedef std::shared_ptr BufferBatchPtr; +/** + * @brief Data for batch training a neural network + */ +class DataBatch { + public: + DataBatch() : size_(0) { data_.clear(); } + /** + * @brief Get batch size + * @return batch size + */ + int64_t getSize() const { return size_; } + /** + * @brief Get num of sequences of sequence data + * @return num of sequences + */ + int64_t getNumSequences() const { + if (data_.empty()) return size_; + return data_[0].sequenceStartPositions + ? data_[0].sequenceStartPositions->getSize() - 1 + : size_; + } + /** + * @brief Set batch size + * @param[in] size size + */ + void setSize(int64_t size) { size_ = size; } + /** + * @brief Get size of argument vector + * @return size of argument vector + * @note For usual supervised learning, input data and label is needed, + * then there will be two argument. 
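+ *
+ * Clarifying example (not part of the original comment): for plain
+ * supervised training with one input stream and one label stream,
+ * getNumStreams() returns 2. A hypothetical way such a batch is filled,
+ * with featureMatrix / labelVector standing in for real data:
+ *
+ *   DataBatch batch;
+ *   batch.setSize(64);                // 64 samples in this batch
+ *   batch.appendData(featureMatrix);  // stream 0: input features
+ *   batch.appendLabel(labelVector);   // stream 1: integer labels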
+ */ + int64_t getNumStreams() const { return data_.size(); } + + /** + * @brief Get a argument with index i + * @param[in] i index in argument vector + * @return a argument with index i + */ + const Argument& getStream(int i) const { return data_[i]; } + /** + * @brief Get all argument + * @return an argument vector + */ + std::vector& getStreams() { return data_; } + /** + * @brief Get all argument const + * @return an argument vector + */ + std::vector getStreams() const { return data_; } + /** + * @brief Clear DataBatch + */ + void clear() { + data_.clear(); + size_ = 0; + } + + /** + * @brief Append data to DataBatch + * @param[in] data matrix data + * @note The order in which each data stream is appended must match the order + * specified in stream_names of DataConfig. The stream_names can be obtained + * using DataProvider::getStreamNames(). + */ + void appendData(MatrixPtr data) { + Argument argu; + argu.value = data; + data_.push_back(argu); + } + + /** + * @brief Append sequence data to DataBatch + * @param[in] data matrix data + * @param[in] sequenceStartPositions sequence data + * @note The order in which each data stream is appended must match the order + * specified in stream_names of DataConfig. The stream_names can be obtained + * using DataProvider::getStreamNames(). + */ + void appendData(const MatrixPtr& data, + const ICpuGpuVectorPtr& sequenceStartPositions) { + Argument argu; + argu.value = data; + argu.sequenceStartPositions = sequenceStartPositions; + data_.push_back(argu); + } + /** + * @brief Append label data + * @param[in] label label data + * @param[in] value matrix data, default null + */ + void appendLabel(IVectorPtr label, MatrixPtr value = nullptr) { + Argument argu; + argu.ids = label; + argu.value = value; + data_.push_back(argu); + } + + /* + * @brief Append argument + * @param[in] argus DataBatch.getStreams() + * @param[in] size DataBatch.getSize() + * @param[in] dataId sub dataprovider id (in MultiDataProvider) + */ + void appendArguments(const std::vector& argus, + int size, + int dataId) { + size_ += size; + for (const auto& argu : argus) { + data_.push_back(argu); + data_.back().dataId = dataId; + } + } + + protected: + /** + * @brief batch size + */ + int64_t size_; + /** + * @brief A batch data consist of a Argument vector, + * An argument corresponds to a type of input data. 
+ */ + std::vector data_; +}; + +class BufferBatch { + public: + BufferBatch() { + hlStream_ = HPPL_STREAM_DEFAULT; + hlEvent_ = NULL; + batchData_ = NULL; + } + ~BufferBatch() { + if (hlEvent_) { + hl_destroy_event(hlEvent_); + hlEvent_ = NULL; + } + delete batchData_; + batchData_ = NULL; + } + + void setDataBatch(DataBatch* batchData) { batchData_ = batchData; } + DataBatch* getDataBatch() { return batchData_; } + + void setCuStream(hl_stream_t stream) { hlStream_ = stream; } + hl_stream_t getCuStream() const { return hlStream_; } + + void setCuEvent(hl_event_t event) { hlEvent_ = event; } + + hl_event_t getCuEvent() const { return hlEvent_; } + + void createCuEvent() { + if (!hlEvent_) { + hlStream_ = HPPL_STREAM_1; + hl_create_event(&hlEvent_); + } + } + + void syncEvent() { + if (hlEvent_) { + hl_stream_wait_event(hlStream_, hlEvent_); + } + } + + void swap(BufferBatch* bufBatch); + void clone(DataBatch* srcBatch, bool useGpu); + + protected: + DataBatch* batchData_; + hl_stream_t hlStream_; + hl_event_t hlEvent_; +}; + +class DataProvider; +typedef std::shared_ptr DataProviderPtr; + +typedef Queue BufferBatchQueue; + +class DoubleBuffer { + public: + DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0); + virtual ~DoubleBuffer(); + void removeOneBatch(DataBatch* dataBatch); + + void setBatchSize(int64_t newBatchSize) { batchSize_ = newBatchSize; } + + int64_t getBatchSize() { return batchSize_; } + + void startAsyncLoad(); + void finishAsyncLoad() { + stopping_ = true; + taskReadySem_.post(); + if (asyncLoader_) { + asyncLoader_->join(); + } + } + + void setPending(bool pending) { pending_ = pending; } + + protected: + virtual void asyncLoadBatch(); + void insertOneBatch(DataBatch* batch); + + DataProvider* dataPool_; + bool useGpu_; + int32_t batchSize_; + ThreadLocal usingBatch_; + BufferBatchQueue* dataQueue_; + BufferBatchQueue* bufferQueue_; + std::unique_ptr asyncLoader_; + Semaphore taskReadySem_; + bool stopping_; + bool pending_; +}; + +/** + * @brief Base class for DataProvider, which supplies data for training + * @note It can supplies multiple streams of data. + * For typical supervised training, there are two streams: + * one is for input, one is for label. + */ +class DataProvider { + public: + static ClassRegistrar registrar_; + static DataProvider* create(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu = FLAGS_use_gpu); + + /** + * @brief create only used for unittest. 
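+ *
+ * Illustrative usage (added for clarity; how cfg is populated is an
+ * assumption, not taken from this header):
+ *
+ *   DataConfig cfg;  // filled in by a test fixture
+ *   std::unique_ptr<DataProvider> dp(
+ *       DataProvider::create(cfg, false));  // useGpu = false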
+ */ + inline static DataProvider* create(const DataConfig& config, + bool useGpu = FLAGS_use_gpu) { + return create(config, ModelConfig(), useGpu); + } + + DataProvider(const DataConfig& config, bool useGpu) + : config_(config), + skipShuffle_(false), + usageRatio_(config.usage_ratio()), + useGpu_(useGpu) { + if (config_.async_load_data()) { + initAsyncLoader(); + } + } + virtual ~DataProvider() {} + + const DataConfig& getConfig() const { return config_; } + + void setSkipShuffle() { skipShuffle_ = true; } + + /** + * @brief Get next batch of training samples + * @param[in] size size of training samples to get + * @param[out] batch a batch of training samples + * @return actual size of obtained training samples + */ + int64_t getNextBatch(int64_t size, DataBatch* batch); + + /** + * @brief Shuffle the data set + */ + virtual void shuffle() = 0; + + /** + * @brief reset all the value of index + * @note reset() must be called before any calls to getNextBatch() + * IMPORTANT: subclass reset() should always call the base class reset() + * at the end of the function + */ + virtual void reset() { + if (doubleBuffer_ != nullptr) { + doubleBuffer_->startAsyncLoad(); + } + } + + /** + * @brief Get the size of training samples + * @return the number of training samples in the data set. + * @note return -1 to indicate unlimited number of samples. + */ + virtual int64_t getSize() = 0; + + /** + * @brief Get next batch training samples internally + * @param[in] size size of training samples to get + * @param[out] batch a batch of training samples + * @return actual size of obtained training samples + */ + virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0; + + protected: + DataConfig config_; + bool skipShuffle_; + float usageRatio_; + bool useGpu_; + std::unique_ptr doubleBuffer_; + ThreadLocal> constantSlots_; + /** + * @@brief Get next batch training samples from buffer + * @param[in] size size of training samples to get + * @param[out] batch a batch of training samples + * @return actual size of obtained training samples + */ + int64_t getNextBatchFromBuffer(int64_t size, DataBatch* batch); + + void initAsyncLoader(); +}; + +/** + * A data provider which does nothing. It only serves as providing + * necessary configurations such as stream_names + */ +class DummyDataProvider : public DataProvider { + public: + DummyDataProvider(const DataConfig& config, bool useGpu) + : DataProvider(config, useGpu) {} + virtual void shuffle() {} + virtual void reset() { DataProvider::reset(); } + virtual int64_t getSize() { return 0; } + virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) { + (void)size; + (void)batch; + return 0; + } +}; + +/** + * Data provider for one input and one integer label. 
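+ *
+ * Typical consumption loop for any DataProvider, added here only to
+ * illustrate the reset()/getNextBatch() contract documented above:
+ *
+ *   provider->reset();  // required before the first getNextBatch()
+ *   DataBatch batch;
+ *   while (provider->getNextBatch(batchSize, &batch) > 0) {
+ *     const std::vector<Argument>& streams = batch.getStreams();
+ *     // feed streams[0] (input) and streams[1] (label) to the trainer
+ *   }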
+ */ +class SimpleDataProviderBase : public DataProvider { + protected: + /// sample feature dimension + int64_t sampleDim_; + /// the number of samples + int64_t bufferCapacity_; + int64_t sampleNumInBuf_; + /// next item to read in buffer + int64_t nextItemIndex_; + /// some user defined info for validation + bool withInfo_; + + /// data buffer: bufferCapacity_ * nDataDim_ + CpuMatrixPtr hInputDataBuf_; + + /// label buffer:bufferCapacity_ * 1 + CpuIVectorPtr hInputLabelBuf_; + + /// info buffer:bufferCapacity_ * 1 + CpuIVectorPtr hInputInfoBuf_; + + ThreadLocal dataBatch_; + ThreadLocal labelBatch_; + ThreadLocal infoBatch_; + + RWLock lock_; + + public: + SimpleDataProviderBase(const DataConfig& config, bool useGpu, bool withInfo); + ~SimpleDataProviderBase() {} + + void shuffle(); + + virtual void reset(); + + virtual int64_t getSize(); + + virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); + + /// return the number of samples in the buffer + int64_t fillBuffer(); + + protected: + /** + * @brief Fill at most size samples into data and label. + * + * Each input is stored in contiguous memory locations in data. + * + * data[n * sampleDim_] .. data[n * sampleDim_ + sampleDim_ - 1] is for + * the input of the n-th sample. + * + * label[n] is the label for the n-th sample. + */ + virtual int64_t fillBufferImp(real* data, + int* label, + int* info, + int64_t size) = 0; +}; + +class SimpleDataProvider : public SimpleDataProviderBase { + public: + SimpleDataProvider(const DataConfig& config, bool useGpu); + ~SimpleDataProvider(); + virtual void reset(); + + protected: + void loadData(const std::string& fileName); + void loadDataFile(const std::string& fileName); + virtual int64_t fillBufferImp(real* data, + int* label, + int* info, + int64_t size); + + protected: + size_t currentSampleIndex_; + std::vector labels_; + std::vector data_; +}; + +} // namespace paddle diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/legacy/gserver/dataproviders/DataProviderGroup.h similarity index 100% rename from paddle/gserver/dataproviders/DataProviderGroup.h rename to paddle/legacy/gserver/dataproviders/DataProviderGroup.h diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp similarity index 100% rename from paddle/gserver/dataproviders/MultiDataProvider.cpp rename to paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/legacy/gserver/dataproviders/MultiDataProvider.h similarity index 100% rename from paddle/gserver/dataproviders/MultiDataProvider.h rename to paddle/legacy/gserver/dataproviders/MultiDataProvider.h diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/legacy/gserver/dataproviders/ProtoReader.h similarity index 100% rename from paddle/gserver/dataproviders/ProtoReader.h rename to paddle/legacy/gserver/dataproviders/ProtoReader.h diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp similarity index 100% rename from paddle/gserver/dataproviders/PyDataProvider.cpp rename to paddle/legacy/gserver/dataproviders/PyDataProvider.cpp diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/legacy/gserver/dataproviders/PyDataProvider.h similarity index 100% rename from paddle/gserver/dataproviders/PyDataProvider.h rename to paddle/legacy/gserver/dataproviders/PyDataProvider.h diff --git 
a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp similarity index 100% rename from paddle/gserver/dataproviders/PyDataProvider2.cpp rename to paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp diff --git a/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..04335dc7cdd0919d8e24518f5e9992f10f8b204c --- /dev/null +++ b/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp @@ -0,0 +1,320 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Evaluator.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/utils/StringUtil.h" + +namespace paddle { + +/** + * calculate sequence-to-sequence edit distance + */ +class CTCErrorEvaluator : public Evaluator { + private: + MatrixPtr outActivations_; + int numTimes_, numClasses_, numSequences_, blank_; + real deletions_, insertions_, substitutions_; + int seqClassficationError_; + mutable std::unordered_map evalResults_; + + std::vector path2String(const std::vector& path) { + std::vector str; + str.clear(); + int prevLabel = -1; + for (std::vector::const_iterator label = path.begin(); + label != path.end(); + label++) { + if (*label != blank_ && + (str.empty() || *label != str.back() || prevLabel == blank_)) { + str.push_back(*label); + } + prevLabel = *label; + } + return str; + } + + std::vector bestLabelSeq() { + std::vector path; + path.clear(); + real* acts = outActivations_->getData(); + for (int i = 0; i < numTimes_; ++i) { + path.push_back(std::max_element(acts + i * numClasses_, + acts + (i + 1) * numClasses_) - + (acts + i * numClasses_)); + } + return path2String(path); + } + + /* "sp, dp, ip" is the weighting parameter of "substitution, deletion, + * insertion" + * in edit-distance error */ + real stringAlignment(std::vector& gtStr, + std::vector& recogStr, + bool backtrace = true, + real sp = 1.0, + real dp = 1.0, + real ip = 1.0) { + std::vector> matrix; + int substitutions, deletions, insertions; + real distance; + int n = gtStr.size(); + int m = recogStr.size(); + + if (n == 0) { + substitutions = 0; + deletions = 0; + insertions = m; + distance = m; + } else if (m == 0) { + substitutions = 0; + deletions = n; + insertions = 0; + distance = n; + } else { + substitutions = 0; + deletions = 0; + insertions = 0; + distance = 0; + // initialize the matrix + matrix.resize(n + 1); + for (int i = 0; i < n + 1; ++i) { + matrix[i].resize(m + 1); + for (int j = 0; j < m + 1; ++j) { + matrix[i][j] = 0; + } + } + for (int i = 0; i < n + 1; ++i) { + matrix[i][0] = i; + } + for (int j = 0; j < m + 1; ++j) { + matrix[0][j] = j; + } + + // calculate the insertions, substitutions and deletions + for (int i = 1; i < n + 1; ++i) { + int s_i = gtStr[i - 1]; + for (int j = 1; j < m + 1; ++j) { + int t_j = recogStr[j - 1]; + int cost = (s_i == t_j) ? 
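+          // (Annotation added for clarity, not in the original source.)
+          // This is the classic Levenshtein recurrence: the cost below is
+          // 0 on a match and 1 otherwise, and each cell becomes
+          //   min(above + 1,      // deletion
+          //       left + 1,       // insertion
+          //       diag + cost)    // substitution, free when cost == 0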
0 : 1; + const int above = matrix[i - 1][j]; + const int left = matrix[i][j - 1]; + const int diag = matrix[i - 1][j - 1]; + const int cell = std::min(above + 1, std::min(left + 1, diag + cost)); + matrix[i][j] = cell; + } + } + + if (backtrace) { + size_t i = n; + size_t j = m; + substitutions = 0; + deletions = 0; + insertions = 0; + + while (i != 0 && j != 0) { + if (matrix[i][j] == matrix[i - 1][j - 1]) { + --i; + --j; + } else if (matrix[i][j] == matrix[i - 1][j - 1] + 1) { + ++substitutions; + --i; + --j; + } else if (matrix[i][j] == matrix[i - 1][j] + 1) { + ++deletions; + --i; + } else { + ++insertions; + --j; + } + } + while (i != 0) { + ++deletions; + --i; + } + while (j != 0) { + ++insertions; + --j; + } + int diff = substitutions + deletions + insertions; + if (diff != matrix[n][m]) { + LOG(ERROR) << "Found path with distance " << diff + << " but Levenshtein distance is " << matrix[n][m]; + } + + distance = (sp * substitutions) + (dp * deletions) + (ip * insertions); + } else { + distance = (real)matrix[n][m]; + } + } + real maxLen = std::max(m, n); + deletions_ += deletions / maxLen; + insertions_ += insertions / maxLen; + substitutions_ += substitutions / maxLen; + + if (distance != 0) { + seqClassficationError_ += 1; + } + + return distance / maxLen; + } + + real editDistance( + real* output, int numTimes, int numClasses, int* labels, int labelsLen) { + numTimes_ = numTimes; + numClasses_ = numClasses; + blank_ = numClasses_ - 1; + outActivations_ = Matrix::create(output, numTimes, numClasses); + std::vector recogStr, gtStr; + recogStr = bestLabelSeq(); + for (int i = 0; i < labelsLen; ++i) { + gtStr.push_back(labels[i]); + } + + return stringAlignment(gtStr, recogStr); + } + + void storeLocalValues() const { + evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0; + evalResults_["deletion_error"] = + numSequences_ ? deletions_ / numSequences_ : 0; + evalResults_["insertion_error"] = + numSequences_ ? insertions_ / numSequences_ : 0; + evalResults_["substitution_error"] = + numSequences_ ? 
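+    // (Annotation added for clarity.) stringAlignment() already divides
+    // each count by max(|gt|, |recog|), so the values stored here are
+    // per-sequence normalized rates averaged over numSequences_, not raw
+    // edit counts.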
substitutions_ / numSequences_ : 0; + evalResults_["sequence_error"] = + (real)seqClassficationError_ / numSequences_; + } + + public: + CTCErrorEvaluator() + : numTimes_(0), + numClasses_(0), + numSequences_(0), + blank_(0), + deletions_(0), + insertions_(0), + substitutions_(0), + seqClassficationError_(0) {} + + virtual real evalImp(std::vector& arguments) { + CHECK_EQ(arguments.size(), (size_t)2); + Argument output, label; + output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT); + label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + CHECK(label.sequenceStartPositions); + CHECK(label.ids); + size_t numSequences = label.sequenceStartPositions->getSize() - 1; + const int* labelStarts = label.sequenceStartPositions->getData(false); + const int* outputStarts = output.sequenceStartPositions->getData(false); + real totalErr = 0; + for (size_t i = 0; i < numSequences; ++i) { + real err = 0; + err = editDistance( + output.value->getData() + output.value->getWidth() * outputStarts[i], + outputStarts[i + 1] - outputStarts[i], + output.value->getWidth(), + label.ids->getData() + labelStarts[i], + labelStarts[i + 1] - labelStarts[i]); + + totalErr += err; + } + + return totalErr; + } + + virtual void eval(const NeuralNetwork& nn) { + Evaluator::eval(nn); + std::vector arguments; + arguments.reserve(config_.input_layers_size()); + for (const std::string& name : config_.input_layers()) { + arguments.push_back(nn.getLayer(name)->getOutput()); + } + } + + virtual void updateSamplesNum(const std::vector& arguments) { + numSequences_ += arguments[1].getNumSequences(); + } + + virtual void start() { + Evaluator::start(); + numSequences_ = 0; + blank_ = 0; + deletions_ = 0; + insertions_ = 0; + substitutions_ = 0; + seqClassficationError_ = 0; + } + + virtual void printStats(std::ostream& os) const { + storeLocalValues(); + os << config_.name() << " error = " << evalResults_["error"]; + os << " deletions error = " << evalResults_["deletion_error"]; + os << " insertions error = " << evalResults_["insertion_error"]; + os << " substitution error = " << evalResults_["substitution_error"]; + os << " sequence error = " << evalResults_["sequence_error"]; + } + + virtual void distributeEval(ParameterClient2* client) { + double buf[6] = {totalScore_, + (double)deletions_, + (double)insertions_, + (double)substitutions_, + (double)seqClassficationError_, + (double)numSequences_}; + client->reduce(buf, buf, 6, FLAGS_trainer_id, 0); + totalScore_ = buf[0]; + deletions_ = (real)buf[1]; + insertions_ = (real)buf[2]; + substitutions_ = (real)buf[3]; + seqClassficationError_ = (int)buf[4]; + numSequences_ = (int)buf[5]; + } + + void getNames(std::vector* names) { + storeLocalValues(); + names->reserve(names->size() + evalResults_.size()); + for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) { + names->push_back(config_.name() + "." 
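+      // (Annotation added for clarity.) Exposed metric names take the
+      // form "<evaluator_name>.<key>", e.g. "<name>.deletion_error";
+      // getValue() below splits on '.' and looks up the trailing key in
+      // evalResults_.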
+ it->first); + } + } + + real getValue(const std::string& name, Error* err) const { + storeLocalValues(); + + std::vector buffers; + paddle::str::split(name, '.', &buffers); + auto it = evalResults_.find(buffers[buffers.size() - 1]); + + if (it == evalResults_.end()) { + *err = Error("Evaluator does not have the key %s", name.c_str()); + return 0.0f; + } + + return it->second; + } + + std::string getType(const std::string& name, Error* err) const { + this->getValue(name, err); + if (!err->isOK()) { + return ""; + } + return "ctc_edit_distance"; + } +}; + +REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator); + +} // namespace paddle diff --git a/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp b/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ea5c609a63a961875543389fadcea7a86b87398a --- /dev/null +++ b/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp @@ -0,0 +1,296 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/legacy/math/Vector.h" +#include "paddle/utils/StringUtil.h" + +#include "Evaluator.h" + +namespace paddle { + +/** + * Chunk evaluator is used to evaluate segment labelling accuracy for a + * sequence. It calculates the chunk detection F1 score. + * + * A chunk is correctly detected if its beginning, end and type are correct. + * Other chunk type is ignored. + * For each label in the label sequence, we have + * + * @code + * tagType = label % numTagType + * chunkType = label / numTagType + * otherChunkType = numChunkTypes + * @endcode + * + * The total number of different labels is numTagType*numChunkTypes+1 + * We support 4 labelling scheme + * The tag type for each of the scheme is shown as follows: + * + * @code + * Scheme Begin Inside End Single + * plain 0 - - - + * IOB 0 1 - - + * IOE - 0 1 - + * IOBES 0 1 2 3 + * @endcode + * + * 'plain' means the whole chunk must contain exactly the same chunk label. 
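+ *
+ * Worked example (added for clarity, not in the original comment): with
+ * the IOB scheme numTagTypes is 2 (B = 0, I = 1). For numChunkTypes = 3,
+ * say PER/ORG/LOC, a label encodes chunkType * 2 + tagType, so label 4
+ * decodes to tagType 0, chunkType 2, i.e. B-LOC, and the extra label
+ * 6 == numTagTypes * numChunkTypes stands for the "other" tag O.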
+ */ +class ChunkEvaluator : public Evaluator { + int otherChunkType_; + int numChunkTypes_; // number of chunk types besides other chunk type + int numTagTypes_; + int tagBegin_; + int tagInside_; + int tagEnd_; + int tagSingle_; + + int64_t numLabelSegments_; + int64_t numOutputSegments_; + int64_t numCorrect_; + + struct Segment { + int begin; + int end; + int type; + bool operator==(const Segment& y) const { + return begin == y.begin && end == y.end && type == y.type; + } + }; + + std::vector labelSegments_; + std::vector outputSegments_; + std::set excludedChunkTypes_; + mutable std::unordered_map values_; + + public: + virtual void init(const EvaluatorConfig& config) { + Evaluator::init(config); + if (config.chunk_scheme() == "IOB") { + numTagTypes_ = 2; + tagBegin_ = 0; + tagInside_ = 1; + tagEnd_ = -1; + tagSingle_ = -1; + } else if (config.chunk_scheme() == "IOE") { + numTagTypes_ = 2; + tagBegin_ = -1; + tagInside_ = 0; + tagEnd_ = 1; + tagSingle_ = -1; + } else if (config.chunk_scheme() == "IOBES") { + numTagTypes_ = 4; + tagBegin_ = 0; + tagInside_ = 1; + tagEnd_ = 2; + tagSingle_ = 3; + } else if (config.chunk_scheme() == "plain") { + numTagTypes_ = 1; + tagBegin_ = -1; + tagInside_ = -1; + tagEnd_ = -1; + tagSingle_ = -1; + } else { + LOG(FATAL) << "Unknown chunk scheme: " << config.chunk_scheme(); + } + CHECK(config.has_num_chunk_types()) << "Missing num_chunk_types in config"; + otherChunkType_ = numChunkTypes_ = config.num_chunk_types(); + + // the chunks of types in excludedChunkTypes_ will not be counted + auto& tmp = config.excluded_chunk_types(); + excludedChunkTypes_.insert(tmp.begin(), tmp.end()); + } + + virtual void start() { + Evaluator::start(); + numLabelSegments_ = 0; + numOutputSegments_ = 0; + numCorrect_ = 0; + } + + virtual void printStats(std::ostream& os) const { + storeLocalValues(); + os << config_.name() << "=" << values_["F1-score"] + << " true_chunks=" << numLabelSegments_ + << " result_chunks=" << numOutputSegments_ + << " correct_chunks=" << numCorrect_; + } + + virtual void distributeEval(ParameterClient2* client) { + int64_t buf[3] = {numLabelSegments_, numOutputSegments_, numCorrect_}; + client->reduce(buf, buf, 3, FLAGS_trainer_id, 0); + numLabelSegments_ = buf[0]; + numOutputSegments_ = buf[1]; + numCorrect_ = buf[2]; + } + + virtual real evalImp(std::vector& arguments) { + CHECK_EQ(arguments.size(), (size_t)2); + IVectorPtr& output = arguments[0].ids; + IVectorPtr& label = arguments[1].ids; + CHECK(!output->useGpu() && !label->useGpu()) << "Not supported"; + auto sequenceStartPositions = + arguments[1].sequenceStartPositions->getVector(false); + CHECK_EQ(output->getSize(), label->getSize()); + CHECK(sequenceStartPositions); + size_t numSequences = sequenceStartPositions->getSize() - 1; + const int* starts = sequenceStartPositions->getData(); + for (size_t i = 0; i < numSequences; ++i) { + eval1(output->getData() + starts[i], + label->getData() + starts[i], + starts[i + 1] - starts[i]); + } + return 0; + } + + void eval1(int* output, int* label, int length) { + getSegments(output, length, outputSegments_); + getSegments(label, length, labelSegments_); + size_t i = 0, j = 0; + while (i < outputSegments_.size() && j < labelSegments_.size()) { + if (outputSegments_[i] == labelSegments_[j] && + excludedChunkTypes_.count(outputSegments_[i].type) != 1) { + ++numCorrect_; + } + if (outputSegments_[i].end < labelSegments_[j].end) { + ++i; + } else if (outputSegments_[i].end > labelSegments_[j].end) { + ++j; + } else { + ++i; + ++j; + } + } + for 
(auto& segment : labelSegments_) { + if (excludedChunkTypes_.count(segment.type) != 1) ++numLabelSegments_; + } + for (auto& segment : outputSegments_) { + if (excludedChunkTypes_.count(segment.type) != 1) ++numOutputSegments_; + } + } + + void getSegments(int* label, int length, std::vector& segments) { + segments.clear(); + segments.reserve(length); + int chunkStart = 0; + bool inChunk = false; + int tag = -1; + int type = otherChunkType_; + for (int i = 0; i < length; ++i) { + int prevTag = tag; + int prevType = type; + CHECK_LE(label[i], numChunkTypes_ * numTagTypes_); + tag = label[i] % numTagTypes_; + type = label[i] / numTagTypes_; + if (inChunk && isChunkEnd(prevTag, prevType, tag, type)) { + Segment segment{ + chunkStart, // begin + i - 1, // end + prevType, + }; + segments.push_back(segment); + inChunk = false; + } + if (isChunkBegin(prevTag, prevType, tag, type)) { + chunkStart = i; + inChunk = true; + } + } + if (inChunk) { + Segment segment{ + chunkStart, // begin + length - 1, // end + type, + }; + segments.push_back(segment); + } + } + + // whether (prevTag, prevType) is the end of a chunk + bool isChunkEnd(int prevTag, int prevType, int tag, int type) { + if (prevType == otherChunkType_) return false; + if (type == otherChunkType_) return true; + if (type != prevType) return true; + if (prevTag == tagBegin_) return tag == tagBegin_ || tag == tagSingle_; + if (prevTag == tagInside_) return tag == tagBegin_ || tag == tagSingle_; + if (prevTag == tagEnd_) return true; + if (prevTag == tagSingle_) return true; + return false; + } + + // whether (tag, type) is the beginning of a chunk + bool isChunkBegin(int prevTag, int prevType, int tag, int type) { + if (prevType == otherChunkType_) return type != otherChunkType_; + if (type == otherChunkType_) return false; + if (type != prevType) return true; + if (tag == tagBegin_) return true; + if (tag == tagInside_) return prevTag == tagEnd_ || prevTag == tagSingle_; + if (tag == tagEnd_) return prevTag == tagEnd_ || prevTag == tagSingle_; + if (tag == tagSingle_) return true; + return false; + } + + // three metrics: precision, recall and F1-score + void getNames(std::vector* names) { + storeLocalValues(); + names->reserve(names->size() + values_.size()); + for (auto it = values_.begin(); it != values_.end(); ++it) { + names->push_back(config_.name() + "." + it->first); + } + } + + // get value by field name + real getValue(const std::string& name, Error* err) const { + storeLocalValues(); + std::vector buffers; + paddle::str::split(name, '.', &buffers); + auto it = values_.find(buffers.back()); + if (it == values_.end()) { // not found + *err = Error("No such key %s", name.c_str()); + return 0.0f; + } + + return it->second; + } + + // get type of evaluator + std::string getType(const std::string& name, Error* err) const { + this->getValue(name, err); + if (!err->isOK()) { + return ""; + } + return "chunk"; + } + + private: + void storeLocalValues() const { + CHECK_GE(numOutputSegments_, 0); + CHECK_GE(numLabelSegments_, 0); + double precision = + !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_; + double recall = + !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_; + values_["precision"] = precision; + values_["recall"] = recall; + values_["F1-score"] = + !numCorrect_ ? 
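+        // (Annotation added for clarity.) Standard chunk F1:
+        //   precision = correct / result_chunks
+        //   recall    = correct / true_chunks
+        //   F1        = 2 * precision * recall / (precision + recall)
+        // The !numCorrect_ test guards the 0/0 case when nothing matched.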
0 : 2 * precision * recall / (precision + recall); + } +}; + +REGISTER_EVALUATOR(chunk, ChunkEvaluator); + +} // namespace paddle diff --git a/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..57657241f8c1517f674670d34cb984b85996bfc7 --- /dev/null +++ b/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp @@ -0,0 +1,308 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Evaluator.h" +#include "paddle/legacy/gserver/layers/DetectionUtil.h" + +using std::map; +using std::vector; +using std::pair; +using std::make_pair; + +namespace paddle { + +/** + * @brief detection map Evaluator + * + * The config file api is detection_map_evaluator. + */ +class DetectionMAPEvaluator : public Evaluator { + public: + DetectionMAPEvaluator() + : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {} + + virtual void start() { + Evaluator::start(); + allTruePos_.clear(); + allFalsePos_.clear(); + numPos_.clear(); + } + + virtual real evalImp(std::vector& arguments) { + overlapThreshold_ = config_.overlap_threshold(); + backgroundId_ = config_.background_id(); + evaluateDifficult_ = config_.evaluate_difficult(); + apType_ = config_.ap_type(); + + MatrixPtr detectTmpValue = arguments[0].value; + Matrix::resizeOrCreate(cpuOutput_, + detectTmpValue->getHeight(), + detectTmpValue->getWidth(), + false, + false); + + MatrixPtr labelTmpValue = arguments[1].value; + Matrix::resizeOrCreate(cpuLabel_, + labelTmpValue->getHeight(), + labelTmpValue->getWidth(), + false, + false); + + cpuOutput_->copyFrom(*detectTmpValue); + cpuLabel_->copyFrom(*labelTmpValue); + + Argument label = arguments[1]; + const int* labelIndex = label.sequenceStartPositions->getData(false); + size_t batchSize = label.getNumSequences(); + + vector>> allGTBBoxes; + vector>>> allDetectBBoxes; + + for (size_t n = 0; n < batchSize; ++n) { + map> bboxes; + for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) { + vector bbox; + getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox); + int c = cpuLabel_->getData()[i * 6]; + bboxes[c].push_back(bbox[0]); + } + allGTBBoxes.push_back(bboxes); + } + + size_t n = 0; + const real* cpuOutputData = cpuOutput_->getData(); + for (size_t imgId = 0; imgId < batchSize; ++imgId) { + map>> bboxes; + size_t curImgId = static_cast((cpuOutputData + n * 7)[0]); + while (curImgId == imgId && n < cpuOutput_->getHeight()) { + vector label; + vector score; + vector bbox; + getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox); + bboxes[label[0]].push_back(make_pair(score[0], bbox[0])); + ++n; + curImgId = static_cast((cpuOutputData + n * 7)[0]); + } + allDetectBBoxes.push_back(bboxes); + } + + for (size_t n = 0; n < batchSize; ++n) { + for (map>::iterator it = + allGTBBoxes[n].begin(); + it != allGTBBoxes[n].end(); + ++it) { + size_t count = 0; + if (evaluateDifficult_) { + count = 
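+          // (Annotation added for clarity.) numPos_ counts ground-truth
+          // boxes per class: with evaluate_difficult on, every box
+          // counts; otherwise boxes flagged isDifficult are excluded in
+          // the else branch.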
it->second.size(); + } else { + for (size_t i = 0; i < it->second.size(); ++i) + if (!(it->second[i].isDifficult)) ++count; + } + if (numPos_.find(it->first) == numPos_.end() && count != 0) { + numPos_[it->first] = count; + } else { + numPos_[it->first] += count; + } + } + } + + // calcTFPos + calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes); + + return 0; + } + + virtual void printStats(std::ostream& os) const { + real mAP = calcMAP(); + os << "Detection mAP=" << mAP; + } + + virtual void distributeEval(ParameterClient2* client) { + LOG(FATAL) << "Distribute detection evaluation not implemented."; + } + + protected: + void calcTFPos(const size_t batchSize, + const vector>>& allGTBBoxes, + const vector>>>& + allDetectBBoxes) { + for (size_t n = 0; n < allDetectBBoxes.size(); ++n) { + if (allGTBBoxes[n].size() == 0) { + for (map>>::const_iterator + it = allDetectBBoxes[n].begin(); + it != allDetectBBoxes[n].end(); + ++it) { + size_t label = it->first; + for (size_t i = 0; i < it->second.size(); ++i) { + allTruePos_[label].push_back(make_pair(it->second[i].first, 0)); + allFalsePos_[label].push_back(make_pair(it->second[i].first, 1)); + } + } + } else { + for (map>>::const_iterator + it = allDetectBBoxes[n].begin(); + it != allDetectBBoxes[n].end(); + ++it) { + size_t label = it->first; + vector> predBBoxes = it->second; + if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) { + for (size_t i = 0; i < predBBoxes.size(); ++i) { + allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1)); + } + } else { + vector gtBBoxes = + allGTBBoxes[n].find(label)->second; + vector visited(gtBBoxes.size(), false); + // Sort detections in descend order based on scores + std::sort(predBBoxes.begin(), + predBBoxes.end(), + sortScorePairDescend); + for (size_t i = 0; i < predBBoxes.size(); ++i) { + real maxOverlap = -1.0; + size_t maxIdx = 0; + for (size_t j = 0; j < gtBBoxes.size(); ++j) { + real overlap = + jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]); + if (overlap > maxOverlap) { + maxOverlap = overlap; + maxIdx = j; + } + } + if (maxOverlap > overlapThreshold_) { + if (evaluateDifficult_ || + (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) { + if (!visited[maxIdx]) { + allTruePos_[label].push_back( + make_pair(predBBoxes[i].first, 1)); + allFalsePos_[label].push_back( + make_pair(predBBoxes[i].first, 0)); + visited[maxIdx] = true; + } else { + allTruePos_[label].push_back( + make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back( + make_pair(predBBoxes[i].first, 1)); + } + } + } else { + allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back( + make_pair(predBBoxes[i].first, 1)); + } + } + } + } + } + } + } + + real calcMAP() const { + real mAP = 0.0; + size_t count = 0; + for (map::const_iterator it = numPos_.begin(); + it != numPos_.end(); + ++it) { + size_t label = it->first; + size_t labelNumPos = it->second; + if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end()) + continue; + vector> labelTruePos = allTruePos_.find(label)->second; + vector> labelFalsePos = + allFalsePos_.find(label)->second; + // Compute average precision. + vector tpCumSum; + getAccumulation(labelTruePos, &tpCumSum); + vector fpCumSum; + getAccumulation(labelFalsePos, &fpCumSum); + std::vector precision, recall; + size_t num = tpCumSum.size(); + // Compute Precision. 
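+      // (Annotation added for clarity, not in the original source.)
+      // tpCumSum[i] / fpCumSum[i] are the true/false positives among the
+      // i + 1 highest-scoring detections, so at every rank
+      //   precision[i] = TP / (TP + FP)  and  recall[i] = TP / labelNumPos.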
+ for (size_t i = 0; i < num; ++i) { + CHECK_LE(tpCumSum[i], labelNumPos); + precision.push_back(static_cast(tpCumSum[i]) / + static_cast(tpCumSum[i] + fpCumSum[i])); + recall.push_back(static_cast(tpCumSum[i]) / labelNumPos); + } + // VOC2007 style + if (apType_ == "11point") { + vector maxPrecisions(11, 0.0); + int startIdx = num - 1; + for (int j = 10; j >= 0; --j) + for (int i = startIdx; i >= 0; --i) { + if (recall[i] < j / 10.) { + startIdx = i; + if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j]; + break; + } else { + if (maxPrecisions[j] < precision[i]) + maxPrecisions[j] = precision[i]; + } + } + for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11; + ++count; + } else if (apType_ == "Integral") { + // Nature integral + real averagePrecisions = 0.; + real prevRecall = 0.; + for (size_t i = 0; i < num; ++i) { + if (fabs(recall[i] - prevRecall) > 1e-6) + averagePrecisions += precision[i] * fabs(recall[i] - prevRecall); + prevRecall = recall[i]; + } + mAP += averagePrecisions; + ++count; + } else { + LOG(FATAL) << "Unkown ap version: " << apType_; + } + } + if (count != 0) mAP /= count; + return mAP * 100; + } + + void getAccumulation(vector> inPairs, + vector* accuVec) const { + std::stable_sort( + inPairs.begin(), inPairs.end(), sortScorePairDescend); + accuVec->clear(); + size_t sum = 0; + for (size_t i = 0; i < inPairs.size(); ++i) { + sum += inPairs[i].second; + accuVec->push_back(sum); + } + } + + std::string getTypeImpl() const { return "detection_map"; } + + real getValueImpl() const { return calcMAP(); } + + private: + real overlapThreshold_; // overlap threshold when determining whether matched + bool evaluateDifficult_; // whether evaluate difficult ground truth + size_t backgroundId_; // class index of background + std::string apType_; // how to calculate mAP (Integral or 11point) + + MatrixPtr cpuOutput_; + MatrixPtr cpuLabel_; + + map numPos_; // counts of true objects each classification + map>> + allTruePos_; // true positive prediction + map>> + allFalsePos_; // false positive prediction +}; + +REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator); + +} // namespace paddle diff --git a/paddle/legacy/gserver/evaluators/Evaluator.cpp b/paddle/legacy/gserver/evaluators/Evaluator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..436c33e43b400514608c5ebc8146b214a12b5505 --- /dev/null +++ b/paddle/legacy/gserver/evaluators/Evaluator.cpp @@ -0,0 +1,1361 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/legacy/gserver/evaluators/Evaluator.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/utils/Stat.h" +#include "paddle/utils/StringUtil.h" + +DECLARE_int32(trainer_id); + +namespace paddle { + +void Evaluator::eval(const NeuralNetwork& nn) { + std::vector arguments; + arguments.reserve(config_.input_layers_size()); + for (const std::string& name : config_.input_layers()) { + arguments.push_back(nn.getLayer(name)->getOutput()); + } + SetDevice device(arguments[0].deviceId); + real score = evalImp(arguments); + totalScore_ += score; + updateSamplesNum(arguments); +} +/** + * @brief classification error Evaluator + * + * The config file api is classification_error_evaluator. + */ +class ClassificationErrorEvaluator : public Evaluator { + public: + /* + ClassificationErrorEvaluator() : totalScore2_(0) {} + + virtual void start() { + Evaluator::start(); + totalScore2_ = 0; + } */ + + virtual void updateSamplesNum(const std::vector& arguments) { + if (3 == arguments.size()) { + numSamples_ += arguments[2].value->getSum(); + } else { + numSamples_ += arguments[0].getBatchSize(); + } + } + + MatrixPtr calcError(std::vector& arguments) { + CHECK_GE(arguments.size(), (size_t)2); + CHECK_LE(arguments.size(), (size_t)3); + MatrixPtr& output = arguments[0].value; + IVectorPtr& label = arguments[1].ids; + MatrixPtr& multiBinaryLabel = arguments[1].value; // For multi binary label + bool supportWeight = (3 == arguments.size()) ? true : false; + MatrixPtr weight = supportWeight ? arguments[2].value : nullptr; + if (nullptr == output || + (nullptr == label && nullptr == multiBinaryLabel) || + (supportWeight && nullptr == weight)) { + return 0; + } + + if (label != nullptr) { + CHECK_EQ(label->getSize(), output->getHeight()); + } else { + CHECK_EQ(multiBinaryLabel->getHeight(), output->getHeight()); + CHECK_EQ(multiBinaryLabel->getWidth(), output->getWidth()); + } + if (supportWeight) { + CHECK_EQ(output->getHeight(), weight->getHeight()); + CHECK_EQ((size_t)1, weight->getWidth()); + } + + const MatrixPtr errorMat = Matrix::create(output->getHeight(), + 1, + /* trans= */ false, + useGpu(arguments[0].deviceId)); + + errorMat->zeroMem(); + + if (label != nullptr) { + errorMat->classificationError(*output, *label, config_.top_k()); + } else if (dynamic_cast(multiBinaryLabel.get()) || + dynamic_cast(multiBinaryLabel.get())) { + errorMat->classificationErrorMulti( + *output, *multiBinaryLabel, config_.classification_threshold()); + } else { + errorMat->binaryClassificationError( + 0, *output, *multiBinaryLabel, config_.classification_threshold()); + } + + if (supportWeight) { + errorMat->dotMul(*errorMat, *weight); + } + return errorMat; + } + + void printStats(std::ostream& os) const { + if (config_.top_k() == 1) { + os << config_.name() << "=" + << (numSamples_ ? totalScore_ / numSamples_ : 0); + } else { + os << " top_" << config_.top_k() + << "_error=" << (numSamples_ ? 
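+         // (Annotation added for clarity.) totalScore_ accumulates the
+         // per-batch sums of 0/1 errors from calcError(), so the value
+         // printed here is the average top-k error rate over all samples
+         // seen so far.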
totalScore_ / numSamples_ : 0); + } + } + + virtual real evalImp(std::vector& arguments) { + MatrixPtr errorMat = calcError(arguments); + return errorMat->getSum(); + } + + virtual void distributeEval(ParameterClient2* client) { + mergeResultsOfAllClients(client); + } + + // Evaluator interface + protected: + std::string getTypeImpl() const { return "classification_error"; } +}; + +/** + * @brief sequence classification error Evaluator + * @note sequence level classification error stats, + * if any frame in one sequence has error, the sequence is error + */ +class SequenceClassificationErrorEvaluator + : public ClassificationErrorEvaluator { + public: + virtual void updateSamplesNum(const std::vector& arguments) { + numSamples_ += arguments[0].getNumSequences(); + } + + virtual real evalImp(std::vector& arguments) { + auto sequenceStartPositions = + arguments[0].sequenceStartPositions->getVector(false); + CHECK(sequenceStartPositions != nullptr); + const int* starts = sequenceStartPositions->getData(); + + MatrixPtr errorMat = calcError(arguments); + + int errCounter = 0; + CpuVector errorVec(0, nullptr); + for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) { + errorVec.subVecFrom( + errorMat->getData(), starts[i], starts[i + 1] - starts[i]); + if (errorVec.getSum() > 0) { + errCounter += 1; + } + } + + return static_cast(errCounter); + } + + virtual void distributeEval(ParameterClient2* client) { + mergeResultsOfAllClients(client); + } + + // Evaluator interface + protected: + std::string getTypeImpl() const { return "seq_classification_error"; } +}; +REGISTER_EVALUATOR(seq_classification_error, + SequenceClassificationErrorEvaluator); +/** + * @brief sum Evaluator + * Calculate the sum of output or label + * + * The config file api is sum_evaluator. + */ +class SumEvaluator : public Evaluator { + public: + SumEvaluator() : cpuLabel_(nullptr), cpuWeight_(nullptr) {} + + virtual void updateSamplesNum(const std::vector& arguments) { + if (2 == arguments.size()) { + numSamples_ += arguments[1].value->getSum(); + } else { + numSamples_ += arguments[0].getBatchSize(); + } + } + + virtual real evalImp(std::vector& arguments) { + REGISTER_TIMER("SumEvaluator"); + CHECK_GE(arguments.size(), (size_t)1); + CHECK_LE(arguments.size(), (size_t)2); + bool supportWeight = (2 == arguments.size()) ? true : false; + if (supportWeight) { + if (nullptr == arguments[1].value) { + return 0; + } + CHECK_EQ(arguments[1].value->getWidth(), (size_t)1); + } + + // The sum of output + if (arguments[0].value) { + if (supportWeight) { + CHECK_EQ(arguments[0].value->getHeight(), + arguments[1].value->getHeight()); + MatrixPtr tmpMat = Matrix::create(arguments[0].value->getHeight(), + arguments[0].value->getWidth(), + /* trans= */ false, + arguments[0].value->useGpu()); + tmpMat->copyFrom(*arguments[0].value); + tmpMat->rowScale(0, *tmpMat, *arguments[1].value); + return tmpMat->getSum(); + } else { + return arguments[0].value->getSum(); + } + // The sum of label + } else if (arguments[0].ids) { + size_t insNum = arguments[0].ids->getSize(); + IVectorPtr label = arguments[0].ids; + MatrixPtr weight = supportWeight ? 
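+    // (Annotation added for clarity.) In this label branch the weighted
+    // sum is sum_i label[i] * weight[i]; GPU vectors are first copied to
+    // cpuLabel_ / cpuWeight_ so the loop below can run on the host.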
arguments[1].value : nullptr; + if (dynamic_cast(label.get())) { + IVector::resizeOrCreate(cpuLabel_, insNum, false); + cpuLabel_->copyFrom(*arguments[0].ids); + + if (supportWeight) { + CHECK_EQ(insNum, arguments[1].value->getHeight()); + Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false); + cpuWeight_->copyFrom(*arguments[1].value); + } + + label = cpuLabel_; + weight = cpuWeight_; + } + + if (supportWeight) { + real score = 0.0; + int* labelD = label->getData(); + real* weightD = weight->getData(); + for (size_t i = 0; i < insNum; ++i) { + score += (labelD[i] * weightD[i]); + } + return score; + } else { + return label->getSum(); + } + } else { + return 0; + } + } + + virtual void distributeEval(ParameterClient2* client) { + mergeResultsOfAllClients(client); + } + + private: + IVectorPtr cpuLabel_; + MatrixPtr cpuWeight_; + + // Evaluator interface + protected: + std::string getTypeImpl() const { return "sum"; } +}; +/** + * @brief column sum Evaluator + * @note column sum for the colIdx-th column * + * - colIdx = 0: the 0-th column. + * - colIdx > 0: the colIdx-th column. + * - colIdx < 0: the last colIdx-th column. + * + * The config file api is column_sum_evaluator. + * + */ +class ColumnSumEvaluator : public Evaluator { + public: + explicit ColumnSumEvaluator(int32_t colIdx) + : colIdx_(colIdx), colNum_(0), sum_(nullptr) {} + + virtual void start() { + Evaluator::start(); + if (nullptr != sum_) { + sum_->zeroMem(); + } + } + + virtual void updateSamplesNum(const std::vector& arguments) { + if (2 == arguments.size()) { + numSamples_ += arguments[1].value->getSum(); + } else { + numSamples_ += arguments[0].getBatchSize(); + } + } + + virtual real evalImp(std::vector& arguments) { + REGISTER_TIMER("ColumnSumEvaluator"); + CHECK_GE(arguments.size(), (size_t)1); + CHECK_LE(arguments.size(), (size_t)2); + bool supportWeight = (2 == arguments.size()) ? 
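+  // (Annotation added for clarity.) The optional second input is a
+  // one-column per-sample weight; when present each row is scaled by its
+  // weight (rowScale) before the column sums are accumulated into sum_,
+  // which always lives on the CPU.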
true : false; + if (nullptr == arguments[0].value || + (supportWeight && nullptr == arguments[1].value)) { + return 0; + } + + size_t insNum = arguments[0].value->getHeight(); + size_t colNum = arguments[0].value->getWidth(); + if (nullptr == sum_) { + sum_ = Matrix::create((size_t)1, colNum, false, /* useGpu */ false); + colNum_ = colNum; + sum_->zeroMem(); + } else { + CHECK_EQ(colNum, sum_->getWidth()); + } + + if (supportWeight) { + CHECK_EQ(insNum, arguments[1].value->getHeight()); + CHECK_EQ((size_t)1, arguments[1].value->getWidth()); + MatrixPtr tmpMat = Matrix::create(insNum, colNum); + if (arguments[0].value->useGpu()) { + tmpMat->copyFrom(*arguments[0].value); + } + if (!arguments[1].value->useGpu()) { + if (!arguments[0].value->useGpu()) { + tmpMat->rowScale(0, *arguments[0].value, *arguments[1].value); + } else { + tmpMat->rowScale(0, *tmpMat, *arguments[1].value); + } + } else { + MatrixPtr tmp2 = Matrix::create(insNum, 1); + tmp2->copyFrom(*arguments[1].value); + if (!arguments[0].value->useGpu()) { + tmpMat->rowScale(0, *arguments[0].value, *tmp2); + } else { + tmpMat->rowScale(0, *tmpMat, *tmp2); + } + } + sum_->accumulateColSum(*tmpMat); + } else { + if (!arguments[0].value->useGpu()) { + sum_->accumulateColSum(*arguments[0].value); + } else { + MatrixPtr tmpMat = Matrix::create(insNum, colNum); + tmpMat->copyFrom(*arguments[0].value); + sum_->accumulateColSum(*tmpMat); + } + } + return 0; + } + + virtual void printStats(std::ostream& os) const { + CHECK(colIdx_ + (int32_t)colNum_ >= 0 && colIdx_ - (int32_t)colNum_ < 0) + << "column index [" << colIdx_ << "] out of range [-" << colNum_ << ", " + << colNum_ << ")"; + size_t colIdx = 0; + if (colIdx_ >= 0) { + colIdx = colIdx_; + } else { + colIdx = colNum_ + colIdx_; + } + os << config_.name() << "=" + << (numSamples_ ? sum_->getElement(0, colIdx) / numSamples_ : 0); + } + + void distributeEval(ParameterClient2* client) { + client->reduce( + sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0); + client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0); + } + + private: + int32_t colIdx_; + size_t colNum_; + MatrixPtr sum_; /* cpu matrix */ + + // Evaluator interface + protected: + std::string getTypeImpl() const { + if (colIdx_ == -1) + return "last-column-sum"; + else + return "column-sum"; + } +}; + +void AucEvaluator::start() { + Evaluator::start(); + memset(statPos_, 0, sizeof(statPos_)); + memset(statNeg_, 0, sizeof(statNeg_)); +} + +real AucEvaluator::evalImp(std::vector& arguments) { + REGISTER_TIMER("AucEvaluator"); + CHECK_GE(arguments.size(), (size_t)2); + CHECK_LE(arguments.size(), (size_t)3); + MatrixPtr output = arguments[0].value; + IVectorPtr label = arguments[1].ids; + MatrixPtr labelval = arguments[1].value; + bool supportWeight = (3 == arguments.size()) ? true : false; + MatrixPtr weight = supportWeight ? arguments[2].value : nullptr; + + if (nullptr == output || (supportWeight && nullptr == weight)) { + return 0; + } + size_t insNum = output->getHeight(); + size_t outputDim = output->getWidth(); + // Copy label from value to a vector. 
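+  // (Annotation added for clarity.) When the label arrives as a
+  // one-column real matrix instead of an id vector, it is wrapped in a
+  // Vector and cast to integers so the positive/negative test below can
+  // index statPos_ / statNeg_ directly.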
+  if (nullptr == label && nullptr != labelval) {
+    // label width is 1
+    CHECK_EQ(1U, labelval->getWidth());
+    VectorPtr vec =
+        Vector::create(labelval->getData(), insNum, output->useGpu());
+    label = vec->castToInt();
+  }
+
+  CHECK_EQ(insNum, label->getSize());
+  if (supportWeight) {
+    CHECK_EQ(insNum, weight->getHeight());
+    CHECK_EQ((size_t)1, weight->getWidth());
+  }
+
+  CHECK(colIdx_ + (int32_t)outputDim >= 0 && colIdx_ - (int32_t)outputDim < 0)
+      << "column index [" << colIdx_ << "] out of range [-" << outputDim
+      << ", " << outputDim << ")";
+  realColumnIdx_ = 0;
+  if (colIdx_ >= 0) {
+    realColumnIdx_ = colIdx_;
+  } else {
+    realColumnIdx_ = outputDim + colIdx_;
+  }
+
+  if (dynamic_cast<GpuMatrix*>(output.get())) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           insNum,
+                           outputDim,
+                           /* trans=*/false,
+                           /* useGpu=*/false);
+    cpuOutput_->copyFrom(*output);
+    IVector::resizeOrCreate(cpuLabel_, insNum, false);
+    cpuLabel_->copyFrom(*label);
+
+    if (supportWeight) {
+      Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
+      cpuWeight_->copyFrom(*weight);
+    }
+
+    output = cpuOutput_;
+    label = cpuLabel_;
+    weight = cpuWeight_;
+  }
+
+  real* outputD = output->getData();
+  int* labelD = label->getData();
+  real* weightD = supportWeight ? weight->getData() : nullptr;
+  size_t pos = realColumnIdx_;
+
+  for (size_t i = 0; i < insNum; ++i) {
+    real value = outputD[pos];
+    uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_);
+    CHECK(binIdx <= kBinNum_) << "bin index [" << binIdx
+                              << "] out of range, predict value[" << value
+                              << "]";
+    real w = supportWeight ? weightD[i] : 1.0;
+    if (labelD[i] == kNegativeLabel_) {
+      statNeg_[binIdx] += w;
+    } else {
+      statPos_[binIdx] += w;
+    }
+    pos += outputDim;
+  }
+  return 0;
+}
+
+void AucEvaluator::distributeEval(ParameterClient2* client) {
+  client->reduce(statPos_, statPos_, kBinNum_ + 1, FLAGS_trainer_id, 0);
+  client->reduce(statNeg_, statNeg_, kBinNum_ + 1, FLAGS_trainer_id, 0);
+}
+
+double AucEvaluator::calcAuc() const {
+  double totPos = 0.0;
+  double totNeg = 0.0;
+  double totPosPrev = 0.0;
+  double totNegPrev = 0.0;
+  double auc = 0.0;
+
+  int64_t idx = kBinNum_;
+  while (idx >= 0) {
+    totPosPrev = totPos;
+    totNegPrev = totNeg;
+    totPos += statPos_[idx];
+    totNeg += statNeg_[idx];
+    auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
+    --idx;
+  }
+
+  if (totPos > 0.0 && totNeg > 0.0) {
+    return auc / totPos / totNeg;
+  } else {
+    return 0.0;
+  }
+}
+
+real AucEvaluator::getValueImpl() const { return calcAuc(); }
+
+std::string AucEvaluator::getTypeImpl() const {
+  if (colIdx_ == -1) {
+    return "last-column-auc";
+  } else {
+    return "auc";
+  }
+}
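(Editorial note, not part of the patch.) calcAuc() never sorts predictions: evalImp() buckets each score into one of kBinNum_ + 1 histogram bins, and calcAuc() sweeps the bins from the highest score down, adding one trapezoid of the ROC curve per bin. A minimal standalone sketch of the same sweep, with a tiny bin count and made-up histograms:

    #include <cstdio>

    // Same trapezoid rule as AucEvaluator::trapezoidArea().
    static double trapezoid(double x1, double x2, double y1, double y2) {
      return (x1 > x2 ? x1 - x2 : x2 - x1) * (y1 + y2) / 2.0;
    }

    int main() {
      const int kBins = 4;  // the real evaluator uses (1 << 24) - 1 bins
      // statPos[b] / statNeg[b]: weighted counts of positive / negative
      // samples whose predicted score fell into bin b (higher bin = higher
      // score).
      double statPos[kBins + 1] = {0, 0, 1, 2, 3};
      double statNeg[kBins + 1] = {3, 2, 1, 0, 0};

      double totPos = 0, totNeg = 0, auc = 0;
      for (int b = kBins; b >= 0; --b) {  // sweep thresholds high to low
        double posPrev = totPos, negPrev = totNeg;
        totPos += statPos[b];
        totNeg += statNeg[b];
        auc += trapezoid(totNeg, negPrev, totPos, posPrev);
      }
      printf("AUC = %f\n", auc / totPos / totNeg);  // ~0.986 for this toy data
      return 0;
    }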
+
+// class RankAucEvaluator
+REGISTER_EVALUATOR(rankauc, RankAucEvaluator);
+
+void RankAucEvaluator::start() { Evaluator::start(); }
+void RankAucEvaluator::updateSamplesNum(
+    const std::vector<Argument>& arguments) {
+  numSamples_ += arguments[0].getNumSequences();
+}
+real RankAucEvaluator::evalImp(std::vector<Argument>& arguments) {
+  CHECK_GE(arguments.size(), 2U);
+  CHECK_LE(arguments.size(), 3U);
+  double batchAuc = 0.0;
+  output_ = arguments[0].value;
+  click_ = arguments[1].value;
+  size_t batchSize = output_->getHeight();
+  CHECK(!output_->useGpu()) << "RankAUC evaluator does not support GPU!";
+
+  if (arguments.size() == 3U) {
+    pv_ = arguments[2].value;
+  } else {
+    Matrix::resizeOrCreate(pv_, batchSize, 1, false, false);
+    std::fill(pv_->getData(), pv_->getData() + batchSize, 1.0);
+  }
+
+  real* outputData = output_->getData();
+  real* clickData = click_->getData();
+  real* pvData = pv_->getData();
+
+  auto startPos = arguments[0].sequenceStartPositions->getVector(false);
+  const int* startPosData = startPos->getData();
+  size_t batchNum = startPos->getSize() - 1;
+  for (size_t i = 0; i < batchNum; ++i) {
+    int beginPos = startPosData[i];
+    int endPos = startPosData[i + 1];
+    batchAuc += calcRankAuc(outputData + beginPos,
+                            clickData + beginPos,
+                            pvData + beginPos,
+                            endPos - beginPos);
+  }
+  return batchAuc;
+}
+
+double RankAucEvaluator::calcRankAuc(real* outputData,
+                                     real* clickData,
+                                     real* pvData,
+                                     size_t size) {
+  outputPair_.clear();
+  for (size_t i = 0; i < size; ++i) {
+    outputPair_.push_back(std::make_pair(outputData[i], i));
+  }
+  std::sort(outputPair_.begin(),
+            outputPair_.end(),
+            [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
+              return a.first > b.first;
+            });
+  double aucTmp = 0.0;
+  double clickSum = 0.0;
+  double oldClickSum = 0.0;
+  double noClick = 0.0;
+  double noClickSum = 0.0;
+
+  double lastScore = outputPair_[0].first + 1.0;
+  for (size_t i = 0; i < size; ++i) {
+    if (lastScore != outputPair_[i].first) {
+      aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
+      oldClickSum = clickSum;
+      noClick = 0.0;
+      lastScore = outputPair_[i].first;
+    }
+    size_t id = outputPair_[i].second;
+    noClick += pvData[id] - clickData[id];
+    noClickSum += noClick;
+    clickSum += clickData[id];
+  }
+  aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
+  return (clickSum * noClickSum) == 0.0 ? 0.0
+                                        : aucTmp / (clickSum * noClickSum);
+}
+
+std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; }
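(Editorial note, not part of the patch.) calcRankAuc() is an O(n log n) formulation of a pairwise statistic: within one query, the AUC is the probability that a clicked impression outranks an unclicked one. Since evalImp() adds one AUC per query and updateSamplesNum() counts sequences, the reported metric is the mean per-query AUC. The O(n^2) sketch below reproduces the value on tie-free toy data (the tie and multi-view bookkeeping above is omitted):

    #include <cstdio>

    // Tie-free toy query: scores, clicks, and page views per impression.
    int main() {
      const int n = 4;
      double score[] = {0.9, 0.7, 0.4, 0.1};
      double click[] = {1, 0, 1, 0};  // clicked views are the positives
      double pv[]    = {1, 1, 1, 1};  // non-clicks per item = pv - click

      double concordant = 0, total = 0;
      for (int i = 0; i < n; ++i) {    // clicked side of the pair
        for (int j = 0; j < n; ++j) {  // unclicked side of the pair
          double w = click[i] * (pv[j] - click[j]);
          if (w == 0) continue;
          total += w;
          if (score[i] > score[j]) concordant += w;
        }
      }
      printf("rank AUC = %g\n", concordant / total);  // 0.75, as calcRankAuc
      return 0;
    }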
+
+// class PrecisionRecallEvaluator
+REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator);
+
+void PrecisionRecallEvaluator::start() {
+  Evaluator::start();
+  statsInfo_.clear();
+  values_.clear();
+}
+
+real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
+  REGISTER_TIMER("PrecisionRecallEvaluator");
+  CHECK_GE(arguments.size(), (size_t)2);
+  CHECK_LE(arguments.size(), (size_t)3);
+  MatrixPtr output = arguments[0].value;
+  IVectorPtr label = arguments[1].ids;
+  MatrixPtr multiBinaryLabel = arguments[1].value;
+  bool supportWeight = (3 == arguments.size()) ? true : false;
+  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
+  if (nullptr == output || (nullptr == label && nullptr == multiBinaryLabel) ||
+      (supportWeight && nullptr == weight)) {
+    return 0;
+  }
+
+  size_t insNum = output->getHeight();
+  size_t outputDim = output->getWidth();
+  if (label != nullptr) {
+    CHECK_EQ(insNum, label->getSize());
+  } else {
+    CHECK_EQ(insNum, multiBinaryLabel->getHeight());
+    CHECK_EQ(outputDim, multiBinaryLabel->getWidth());
+  }
+  if (supportWeight) {
+    CHECK_EQ(insNum, weight->getHeight());
+    CHECK_EQ((size_t)1, weight->getWidth());
+  }
+
+  if (statsInfo_.size() != outputDim) {
+    statsInfo_.clear();
+    statsInfo_.resize(outputDim);
+  }
+
+  isMultiBinaryLabel_ = (nullptr == label) ? true : false;
+  if (label != nullptr) {
+    if (dynamic_cast<GpuMatrix*>(output.get())) {
+      Matrix::resizeOrCreate(cpuOutput_, insNum, outputDim, false, false);
+      cpuOutput_->copyFrom(*output);
+      IVector::resizeOrCreate(cpuLabel_, insNum, false);
+      cpuLabel_->copyFrom(*label);
+      if (supportWeight) {
+        Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
+        cpuWeight_->copyFrom(*weight);
+      }
+
+      output = cpuOutput_;
+      label = cpuLabel_;
+      weight = cpuWeight_;
+    }
+    calcStatsInfo(output, label, weight);
+  } else {
+    // Multi-binary labels are not supported on GPU
+    CHECK(dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()));
+    calcStatsInfoMulti(output, multiBinaryLabel, weight);
+  }
+  return 0;
+}
+
+void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
+  PrintStatsInfo info;
+  bool containMacroMicroInfo = getStatsInfo(&info);
+  os << "positive_label=" << config_.positive_label()
+     << " precision=" << info.precision << " recall=" << info.recall
+     << " F1-score=" << info.f1;
+  if (containMacroMicroInfo) {
+    os << "macro-average-precision=" << info.macroAvgPrecision
+       << " macro-average-recall=" << info.macroAvgRecall
+       << " macro-average-F1-score=" << info.macroAvgF1Score;
+    if (!isMultiBinaryLabel_) {
+      // precision and recall are equal in this case
+      os << " micro-average-precision=" << info.microAvgPrecision;
+    } else {
+      os << " micro-average-precision=" << info.microAvgPrecision
+         << " micro-average-recall=" << info.microAvgRecall
+         << " micro-average-F1-score=" << info.microAvgF1Score;
+    }
+  }
+}
+
+void PrecisionRecallEvaluator::calcStatsInfo(const MatrixPtr& output,
+                                             const IVectorPtr& label,
+                                             const MatrixPtr& weight) {
+  size_t insNum = output->getHeight();
+  size_t dim = output->getWidth();
+  real* outputD = output->getData();
+  int* labelD = label->getData();
+  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
+  for (size_t i = 0; i < insNum; ++i) {
+    CHECK_GE(labelD[i], 0);
+    CHECK_LT((size_t)labelD[i], dim);
+    size_t maxIdx = 0;
+    real maxValue = outputD[i * dim];
+    for (size_t j = 1; j < dim; ++j) {
+      size_t idx = i * dim + j;
+      if (maxValue < outputD[idx]) {
+        maxIdx = j;
+        maxValue = outputD[idx];
+      }
+    }
+
+    real w = (weightD != nullptr) ? weightD[i] : 1.0;
+    if (maxIdx == (size_t)labelD[i]) {
+      statsInfo_[maxIdx].TP += w;  // true positive for labelD[i]
+      // true negative for all labels except for labelD[i]
+      for (size_t j = 0; j < dim; ++j) {
+        statsInfo_[j].TN += w;
+      }
+      statsInfo_[maxIdx].TN -= w;
+    } else {
+      statsInfo_[labelD[i]].FN += w;  // false negative for labelD[i]
+      statsInfo_[maxIdx].FP += w;     // false positive for maxIdx
+      // true negatives for all labels except for maxIdx and labelD[i]
+      for (size_t j = 0; j < dim; ++j) {
+        statsInfo_[j].TN += w;
+      }
+      statsInfo_[maxIdx].TN -= w;
+      statsInfo_[labelD[i]].TN -= w;
+    }
+  }
+}
+
+void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output,
+                                                  const MatrixPtr& label,
+                                                  const MatrixPtr& weight) {
+  size_t insNum = output->getHeight();
+  size_t dim = output->getWidth();
+  real* outputD = output->getData();
+  auto labelD = dynamic_cast<CpuSparseMatrix*>(label.get());
+  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
+  real threshold = config_.classification_threshold();
+  for (size_t i = 0; i < insNum; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      real w = (weightD != nullptr) ? weightD[i] : 1.0;
+      size_t idx = i * dim + j;
+      if (outputD[idx] < threshold) {
+        statsInfo_[j].TN += w;  // true negative
+      } else {
+        statsInfo_[j].FP += w;  // false positive
+      }
+    }
+
+    const int* cols = labelD->getRowCols(i);
+    for (size_t j = 0; j < labelD->getColNum(i); ++j) {
+      CHECK_LT(size_t(cols[j]), dim);
+      real w = (weightD != nullptr) ? weightD[i] : 1.0;
+      size_t idx = i * dim + cols[j];
+      if (outputD[idx] < threshold) {
+        statsInfo_[cols[j]].FN += w;  // false negative
+        statsInfo_[cols[j]].TN -= w;  // true negative
+      } else {
+        statsInfo_[cols[j]].TP += w;  // true positive
+        statsInfo_[cols[j]].FP -= w;  // false positive
+      }
+    }
+  }
+}
+
+void PrecisionRecallEvaluator::storeLocalValues() const {
+  if (this->values_.size() == 0) {
+    PrintStatsInfo info;
+    bool containMacroMicroInfo = getStatsInfo(&info);
+    values_["precision"] = info.precision;
+    values_["recal"] = info.recall;
+    values_["F1-score"] = info.f1;
+    if (containMacroMicroInfo) {
+      values_["macro-average-precision"] = info.macroAvgPrecision;
+      values_["macro-average-recall"] = info.macroAvgRecall;
+      values_["macro-average-F1-score"] = info.macroAvgF1Score;
+      if (!isMultiBinaryLabel_) {
+        // precision and recall are equal in this case
+        values_["micro-average-precision"] = info.microAvgPrecision;
+      } else {
+        values_["micro-average-precision"] = info.microAvgPrecision;
+        values_["micro-average-recall"] = info.microAvgRecall;
+        values_["micro-average-F1-score"] = info.microAvgF1Score;
+      }
+    }
+  }
+}
+
+void PrecisionRecallEvaluator::getNames(std::vector<std::string>* names) {
+  this->storeLocalValues();
+  names->reserve(this->values_.size());
+  for (auto it = this->values_.begin(); it != this->values_.end(); ++it) {
+    names->push_back(this->config_.name() + "." + it->first);
+  }
+}
+
+real PrecisionRecallEvaluator::getValue(const std::string& name,
+                                        Error* err) const {
+  this->storeLocalValues();
+  std::vector<std::string> buffers;
+  paddle::str::split(name, '.', &buffers);
+  auto it = this->values_.find(buffers[buffers.size() - 1]);
+  if (it == this->values_.end()) {  // not found
+    *err = Error("No such key %s", name.c_str());
+    return .0f;
+  }
+
+  return it->second;
+}
+
+std::string PrecisionRecallEvaluator::getType(const std::string& name,
+                                              Error* err) const {
+  this->getValue(name, err);
+  if (!err->isOK()) {
+    return "";
+  }
+  return "precision_recall";
+}
+
+void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
+  size_t size = 4 * statsInfo_.size();
+  double* buf = new double[size];
+  for (size_t i = 0; i < statsInfo_.size(); ++i) {
+    buf[4 * i + 0] = statsInfo_[i].TP;
+    buf[4 * i + 1] = statsInfo_[i].TN;
+    buf[4 * i + 2] = statsInfo_[i].FP;
+    buf[4 * i + 3] = statsInfo_[i].FN;
+  }
+  client->reduce(buf, buf, size, FLAGS_trainer_id, 0);
+  for (size_t i = 0; i < statsInfo_.size(); ++i) {
+    statsInfo_[i].TP = buf[4 * i + 0];
+    statsInfo_[i].TN = buf[4 * i + 1];
+    statsInfo_[i].FP = buf[4 * i + 2];
+    statsInfo_[i].FN = buf[4 * i + 3];
+  }
+  delete[] buf;
+}
+
+bool PrecisionRecallEvaluator::getStatsInfo(
+    PrecisionRecallEvaluator::PrintStatsInfo* info) const {
+  int label = config_.positive_label();
+  if (label != -1) {
+    CHECK(label >= 0 && label < (int)statsInfo_.size())
+        << "positive_label [" << label << "] should be in range [0, "
+        << statsInfo_.size() << ")";
+    info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
+    info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
+    info->f1 = calcF1Score(info->precision, info->recall);
+    return false;
+  }
+
+  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
+  // macro average method: precision = (precision1+precision2)/2
+  double microTotalTP = 0;
+  double microTotalFP = 0;
+  double microTotalFN = 0;
+  info->macroAvgPrecision = 0;
+  info->macroAvgRecall = 0;
+  size_t numLabels = statsInfo_.size();
+  for (size_t i = 0; i < numLabels; ++i) {
+    microTotalTP += statsInfo_[i].TP;
+    microTotalFP += statsInfo_[i].FP;
+    microTotalFN += statsInfo_[i].FN;
+    info->macroAvgPrecision +=
+        calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
+    info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
+  }
+  info->macroAvgPrecision /= numLabels;
+  info->macroAvgRecall /= numLabels;
+  info->macroAvgF1Score =
+      calcF1Score(info->macroAvgPrecision, info->macroAvgRecall);
+
+  info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
+  info->microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
+  info->microAvgF1Score =
+      calcF1Score(info->microAvgPrecision, info->microAvgRecall);
+  return true;
+}
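(Editorial note, not part of the patch.) The distinction computed above in one pass: macro-averaging averages the per-class precisions, while micro-averaging pools the raw counts first, so frequent classes dominate. A toy two-class check with assumed counts:

    #include <cstdio>

    int main() {
      // Assumed counts: class 0 is easy (TP=8, FP=2), class 1 hard (TP=1, FP=1).
      double TP[] = {8, 1}, FP[] = {2, 1};
      double macro = 0, sumTP = 0, sumFP = 0;
      for (int i = 0; i < 2; ++i) {
        macro += TP[i] / (TP[i] + FP[i]);  // per-class precision, averaged below
        sumTP += TP[i];
        sumFP += FP[i];
      }
      macro /= 2;
      double micro = sumTP / (sumTP + sumFP);  // pool counts, then one precision
      printf("macro=%.2f micro=%.2f\n", macro, micro);  // macro=0.65 micro=0.75
      return 0;
    }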
+
+REGISTER_EVALUATOR(pnpair, PnpairEvaluator);
+void PnpairEvaluator::start() {
+  Evaluator::start();
+  memset(pairArray_, 0, sizeof(pairArray_));
+  predictArray_.clear();
+}
+
+real PnpairEvaluator::evalImp(std::vector<Argument>& arguments) {
+  CHECK_GE(arguments.size(), 3UL);
+  CHECK_LE(arguments.size(), 4UL);
+  MatrixPtr output = arguments[0].value;
+  IVectorPtr label = arguments[1].ids;
+  IVectorPtr info = arguments[2].ids;
+  bool supportWeight = (4 == arguments.size()) ? true : false;
+  MatrixPtr weight = supportWeight ? arguments[3].value : nullptr;
+  if (nullptr == output || nullptr == label ||
+      (supportWeight && nullptr == weight)) {
+    return 0;
+  }
+  size_t height = output->getHeight();
+  size_t width = output->getWidth();
+  CHECK_EQ(height, label->getSize());
+  CHECK_EQ(height, info->getSize());
+  if (supportWeight) {
+    CHECK_EQ(height, weight->getHeight());
+    CHECK_EQ((size_t)1, weight->getWidth());
+  }
+
+  if (dynamic_cast<GpuMatrix*>(output.get())) {
+    Matrix::resizeOrCreate(cpuOutput_, height, width, false, false);
+    IVector::resizeOrCreate(cpuLabel_, height, false);
+    IVector::resizeOrCreate(cpuInfo_, height, false);
+    cpuOutput_->copyFrom(*output);
+    cpuLabel_->copyFrom(*label);
+    cpuInfo_->copyFrom(*info);
+
+    output = cpuOutput_;
+    label = cpuLabel_;
+    info = cpuInfo_;
+
+    if (supportWeight) {
+      Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false);
+      cpuWeight_->copyFrom(*weight);
+      weight = cpuWeight_;
+    }
+  }
+
+  real* outputs = output->getData();
+  int* labels = label->getData();
+  int* infos = info->getData();
+  real* weights = supportWeight ? weight->getData() : nullptr;
+  for (size_t i = 0; i < output->getHeight(); i++) {
+    real y1 = outputs[i * width + (width - 1)];
+    real w = supportWeight ? weights[i] : 1.0;
+    predictArray_.push_back(PredictionResult(y1, labels[i], infos[i], w));
+  }
+  return 0;
+}
+
+void PnpairEvaluator::stat(size_t start,
+                           size_t end,
+                           PredictionResult* answers,
+                           double& pos,
+                           double& neg,
+                           double& spe) {
+  for (size_t i = start; i < end; i++) {
+    for (size_t j = i + 1; j < end; j++) {
+      CHECK_EQ(answers[i].queryid, answers[j].queryid);
+      // The pair weight is the mean of the two samples' weights
+      double weight = (answers[i].weight + answers[j].weight) / 2.0;
+      if (answers[i].label != answers[j].label) {
+        if ((answers[i].out > answers[j].out &&
+             answers[i].label > answers[j].label) ||
+            (answers[i].out < answers[j].out &&
+             answers[i].label < answers[j].label)) {
+          pos += weight;
+        } else if ((answers[i].out > answers[j].out &&
+                    answers[i].label < answers[j].label) ||
+                   (answers[i].out < answers[j].out &&
+                    answers[i].label > answers[j].label)) {
+          neg += weight;
+        } else {
+          spe += weight;
+        }
+      }
+    }
+  }
+}
+
+void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
+  std::sort(predictArray.begin(),
+            predictArray.end(),
+            [](const PredictionResult& x, const PredictionResult& y) {
+              return x.queryid < y.queryid;
+            });
+
+  double pos = 0;
+  double neg = 0;
+  double special = 0;
+  auto start = predictArray.begin();
+  while (start != predictArray.end()) {
+    auto end = std::find_if(
+        start + 1, predictArray.end(), [=](const PredictionResult& x) {
+          return x.queryid != start->queryid;
+        });
+    CHECK(end != start);
+    stat(start - predictArray.begin(),
+         end - predictArray.begin(),
+         predictArray.data(),
+         pos,
+         neg,
+         special);
+
+    start = end;
+  }
+
+  pairArray_[0] += pos;
+  pairArray_[1] += neg;
+
+  LOG(INFO) << " calc total pos pair: " << pos
+            << " calc total neg pair: " << neg
+            << " calc total special pair: " << special;
+}
+
+std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; }
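(Editorial note, not part of the patch.) stat() scores every label-discordant pair within a query: score order agreeing with label order counts toward pos, disagreeing toward neg, and score ties toward spe; the reported metric is pos/neg. A stripped-down, unweighted, tie-free illustration:

    #include <cstdio>

    struct Result { double out; int label; };

    int main() {
      // One query; the real code averages the two samples' weights per pair.
      Result r[] = {{0.9, 1}, {0.6, 0}, {0.3, 1}};
      double pos = 0, neg = 0;
      for (int i = 0; i < 3; ++i) {
        for (int j = i + 1; j < 3; ++j) {
          if (r[i].label == r[j].label) continue;  // only discordant labels
          bool agree = (r[i].out > r[j].out) == (r[i].label > r[j].label);
          (agree ? pos : neg) += 1.0;
        }
      }
      printf("pos=%g neg=%g pos/neg=%g\n", pos, neg, pos / neg);  // 1 1 1
      return 0;
    }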
+
+ClassRegistrar<Evaluator> Evaluator::registrar_;
+Evaluator* Evaluator::create(const EvaluatorConfig& config) {
+  Evaluator* evaluator = registrar_.createByType(config.type());
+  evaluator->init(config);
+  return evaluator;
+}
+
+REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator);
+REGISTER_EVALUATOR(sum, SumEvaluator);
+static InitFunction __reg_type_auc_sum__([]() {
+  Evaluator::registrar_.registerClass(
+      "last-column-sum", [] { return new ColumnSumEvaluator(-1); });
+  Evaluator::registrar_.registerClass("last-column-auc",
+                                      [] { return new AucEvaluator(-1); });
+});
+
+/**
+ * @brief print value of each layer.
+ *
+ * The config file api is value_printer_evaluator.
+ */
+class ValuePrinter : public NotGetableEvaluator {
+ public:
+  virtual void eval(const NeuralNetwork& nn) {
+    for (const std::string& name : config_.input_layers()) {
+      nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
+                                                      "layer=" + name + " ");
+    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
+};
+REGISTER_EVALUATOR(value_printer, ValuePrinter);
+
+/**
+ * @brief print gradient of each layer.
+ *
+ * The config file api is gradient_printer_evaluator.
+ */
+class GradientPrinter : public NotGetableEvaluator {
+ public:
+  virtual void eval(const NeuralNetwork& nn) {
+    for (const std::string& name : config_.input_layers()) {
+      const Argument& argu = nn.getLayer(name)->getOutput();
+      if (argu.grad) {
+        std::ostringstream os;
+        argu.grad->print(os);
+        LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
+      }
+    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
+};
+REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
+/**
+ * @brief print the row-max id vector of each layer
+ *
+ * The config file api is maxid_printer_evaluator.
+ */
+class MaxIdPrinter : public NotGetableEvaluator {
+ private:
+  IVectorPtr maxIds_;
+  MatrixPtr maxValues_;
+
+ public:
+  MaxIdPrinter() {}
+
+  virtual void eval(const NeuralNetwork& nn) {
+    for (const std::string& name : config_.input_layers()) {
+      const Argument& argu = nn.getLayer(name)->getOutput();
+      if (argu.value) {
+        size_t height = argu.value->getHeight();
+        size_t width = config_.num_results();
+        IVector::resizeOrCreate(maxIds_, height * width, false);
+        Matrix::resizeOrCreate(maxValues_, height, width, false);
+        argu.value->rowMax(*maxIds_, *maxValues_);
+        std::ostringstream os;
+        int* ids = maxIds_->getData();
+        real* values = maxValues_->getData();
+        for (size_t i = 0; i < height; ++i) {
+          for (size_t j = 0; j < width; ++j) {
+            size_t pos = i * width + j;
+            os << ids[pos] << " : " << values[pos] << ", ";
+          }
+          os << std::endl;
+        }
+        LOG(INFO) << "layer=" << name << " row max id vector:\n" << os.str();
+      }
+    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
+};
+REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
+/**
+ * @brief print sequence max frames of each layer
+ *
+ * The config file api is maxframe_printer_evaluator.
+ */
+class MaxFramePrinter : public NotGetableEvaluator {
+ private:
+  IVectorPtr maxIds_;
+  MatrixPtr maxValues_;
+  MatrixPtr value_;
+
+ public:
+  MaxFramePrinter() {
+    value_ =
+        Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, false);
+  }
+
+  virtual void eval(const NeuralNetwork& nn) {
+    for (const std::string& name : config_.input_layers()) {
+      const Argument& argu = nn.getLayer(name)->getOutput();
+
+      CHECK_EQ(argu.value->getWidth(), 1LU);
+      size_t numSequences = argu.getNumSequences();
+      const int* starts = argu.sequenceStartPositions->getData(false);
+
+      std::ostringstream os;
+      for (size_t i = 0; i < numSequences; ++i) {
+        size_t offset = starts[i];
+        size_t size = starts[i + 1] - starts[i];
+        value_->setData(argu.value->getData() + offset, 1LU, size);
+
+        size_t height = 1LU;
+        size_t width = std::min((size_t)config_.num_results(), size);
+        IVector::resizeOrCreate(maxIds_, height * width, false);
+        Matrix::resizeOrCreate(maxValues_, height, width, false);
+
+        value_->rowMax(*maxIds_, *maxValues_);
+
+        int* ids = maxIds_->getData();
+        real* values = maxValues_->getData();
+        for (size_t j = 0; j < width; ++j) {
+          os << ids[j] << " : " << values[j] << ", ";
+        }
+        os << "total " << size << " frames" << std::endl;
+      }
+      LOG(INFO) << "layer=" << name << " sequence max frames:\n" << os.str();
+    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
+};
+REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
+
+/**
+ * @brief print text according to index matrix and a dictionary.
+ *
+ * There can be multiple inputs to this layer:
+ * - If there is only one input, the input must be a matrix containing
+ *   the sequence of indices;
+ * - If there is more than one input, the first input should be ids,
+ *   and is interpreted as sample ids.
+ *
+ * The output format will be:
+ *
+ * - sequence without sub-sequence, with probability.
+ *
+ *   @code
+ *   id \t prob space_separated_tokens_from_dictionary_according_to_seq
+ *   @endcode
+ *
+ * - sequence without sub-sequence, without probability.
+ *
+ *   @code
+ *   id \t space_separated_tokens_from_dictionary_according_to_seq
+ *   @endcode
+ *
+ * - sequence with sub-sequence, without probability.
+ *
+ *   @code
+ *   id \t space_separated_tokens_from_dictionary_according_to_sub_seq
+ *   \t \t space_separated_tokens_from_dictionary_according_to_sub_seq
+ *   ...
+ *   @endcode
+ *
+ * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
+ * with maxid (when generating) as an input.
+ *
+ * The config file api is seqtext_printer_evaluator.
+ *
+ */
+class SequenceTextPrinter : public NotGetableEvaluator {
+ private:
+  /// dict_file, which contains a list of tokens
+  std::vector<std::string> dict_;
+  /// result_file, which is the output file
+  std::ofstream os_;
+  /// True/False, to indicate whether to use space to separate output tokens.
+  /// Default is True. No space is added if set to False.
+  bool delimited_;
+  /// store the cpu version of argument.ids
+  std::vector<IVectorPtr> cpuIds_;
+  /// store the probability associated with each sequence
+  std::vector<MatrixPtr> cpuIn_;
+
+ public:
+  SequenceTextPrinter() {}
+
+  virtual void init(const EvaluatorConfig& config) {
+    Evaluator::init(config);
+    if (!config.dict_file().empty()) {
+      loadFileList(config.dict_file(), dict_);
+    }
+
+    os_.open(config.result_file(), std::ofstream::trunc);
+    CHECK(os_.is_open()) << "Failed to open file " << config.result_file();
+    delimited_ = config.delimited();
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    CHECK_GE(arguments.size(), 1LU);
+    bool hasId = arguments.size() > 1;
+    size_t numSequences = arguments[0].getNumSequences();
+    if (hasId) {
+      CHECK_EQ(arguments[0].ids->getSize(), numSequences)
+          << "first input must be sample id.";
+    }
+    for (size_t i = hasId ? 1 : 0; i < arguments.size(); ++i) {
+      CHECK_EQ((size_t)arguments[i].getNumSequences(), numSequences);
+    }
+
+    auto resizeVector = [](IVectorPtr& dest, const IVectorPtr& src) {
+      if (src && src->useGpu()) {
+        IVector::resizeOrCreate(dest, src->getSize(), false);
+        dest->copyFrom(*src);
+      } else {
+        dest = src;
+      }
+    };
+
+    auto resizeMatrix = [](MatrixPtr& dest, const MatrixPtr& src) {
+      if (src && src->useGpu()) {
+        Matrix::resizeOrCreate(
+            dest, src->getHeight(), src->getWidth(), false, false);
+        dest->copyFrom(*src);
+      } else {
+        dest = src;
+      }
+    };
+
+    cpuIds_.resize(arguments.size());
+    cpuIn_.resize(arguments.size());
+    for (size_t i = 0; i < arguments.size(); ++i) {
+      resizeVector(cpuIds_[i], arguments[i].ids);
+      resizeMatrix(cpuIn_[i], arguments[i].in);
+    }
+
+    int* sampleIds = nullptr;
+    if (hasId) {
+      sampleIds = cpuIds_[0]->getData();
+    }
+
+    for (size_t i = 0; i < numSequences; ++i) {
+      os_ << (hasId ? sampleIds[i] : i);
+      for (size_t j = hasId ? 1 : 0; j < arguments.size(); ++j) {
+        int* output = cpuIds_[j]->getData();
+        const int* starts =
+            arguments[j].sequenceStartPositions->getData(false);
+
+        auto seqPrint = [&](int start, int end) {
+          os_ << "\t";
+          for (int k = start; k < end; k++) {
+            int id = output[k];
+            os_ << (delimited_ ? " " : "");
+            if (!dict_.empty()) {
+              CHECK_LT((size_t)id, dict_.size());
+              os_ << dict_[id];
+            } else {
+              os_ << id;
+            }
+          }
+        };
+
+        if (arguments[j].hasSubseq()) {
+          // print sequence with sub-sequence
+          const int* subStarts =
+              arguments[j].subSequenceStartPositions->getData(false);
+          int subSeqId_start = 0;
+          int subSeqId_end = 0;
+          for (size_t k = 0; k < (size_t)arguments[j].getNumSubSequences() + 1;
+               ++k) {
+            if (starts[i] == subStarts[k]) subSeqId_start = k;
+            if (starts[i + 1] == subStarts[k]) subSeqId_end = k;
+          }
+          for (int k = subSeqId_start; k < subSeqId_end; k++) {
+            seqPrint(subStarts[k], subStarts[k + 1]);
+            os_ << std::endl;
+          }
+
+        } else {
+          // print sequence without sub-sequence
+          if (arguments[j].in) {  // beam print
+            real* probs = cpuIn_[j]->rowBuf(i);
+            os_ << std::endl;
+            int start = starts[i];
+            int seqEnd = starts[i + 1];
+            for (size_t k = 0; k < arguments[j].in->getWidth(); ++k) {
+              if (start == seqEnd) {
+                break;
+              }
+              int end = start + output[start] + 2;
+              CHECK_LE(end, seqEnd);
+              CHECK_EQ(output[end - 1], -1);
+              os_ << k << "\t" << probs[k];
+              seqPrint(start + 1, end - 1);
+              os_ << std::endl;
+              start = end;
+            }
+          } else {
+            seqPrint(starts[i], starts[i + 1]);
+          }
+        }
+      }
+      os_ << std::endl;
+    }
+    return 0;
+  }
+};
+REGISTER_EVALUATOR(seq_text_printer, SequenceTextPrinter);
+/**
+ * @brief print classification error.
+ *
+ * The config file api is classification_error_printer_evaluator.
+ */
+class ClassificationErrorPrinter : public ClassificationErrorEvaluator {
+ public:
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    MatrixPtr errorMat = calcError(arguments);
+
+    std::ostringstream os;
+    errorMat->print(os);
+    LOG(INFO) << "Printer=" << config_.name() << " Classification Error:\n"
+              << os.str();
+
+    if (auto startPos = arguments[0].sequenceStartPositions) {
+      std::ostringstream os;
+      startPos->getVector(false)->print(os, startPos->getSize());
+      LOG(INFO) << "Printer=" << config_.name() << " sequence pos vector:\n"
+                << os.str();
+    }
+    return 0;
+  }
+};
+REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter);
+
+std::string DummyEvaluator::getTypeImpl() const { return "dummy"; }
+
+} // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/Evaluator.h b/paddle/legacy/gserver/evaluators/Evaluator.h
new file mode 100644
index 0000000000000000000000000000000000000000..90989bb0b6d49bc5bef3b5009d7179a52df7587e
--- /dev/null
+++ b/paddle/legacy/gserver/evaluators/Evaluator.h
@@ -0,0 +1,510 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/parameter/Argument.h"
+#include "paddle/legacy/pserver/ParameterClient2.h"
+#include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Error.h"
+
+namespace paddle {
+
+class NeuralNetwork;
+/**
+ * @def REGISTER_EVALUATOR
+ * @brief Macro for registering evaluator class
+ */
+
+#define REGISTER_EVALUATOR(__type_name, __class_name)                \
+  static InitFunction __reg_type_##__type_name([]() {                \
+    Evaluator::registrar_.registerClass<__class_name>(#__type_name); \
+  })
+/**
+ * @brief Base class for Evaluator
+ * Evaluating the performance of a model is very important.
+ * It indicates how well the scores (predictions) produced by a trained
+ * model match the labels of a dataset.
+ */
+class Evaluator {
+ public:
+  static Evaluator* create(const EvaluatorConfig& config);
+
+  Evaluator() : numSamples_(0), totalScore_(0) {}
+
+  virtual ~Evaluator() {}
+
+  virtual void init(const EvaluatorConfig& config) { config_ = config; }
+
+  /**
+   * @brief start to evaluate some data
+   */
+  virtual void start() {
+    numSamples_ = 0;
+    totalScore_ = 0;
+  }
+
+  /**
+   * @brief Process a batch of data.
+   */
+  virtual void eval(const NeuralNetwork& nn);
+
+  /**
+   * @brief Process a batch of data.
+   * @return the score for the batch if it makes sense to sum the score
+   *         across batches.
+   * @note Otherwise evaluator should return 0 and override finish() and
+   *       printStats() to do the right calculation.
+   */
+  virtual real evalImp(std::vector<Argument>& arguments) = 0;
+
+  /**
+   * @brief Update the number of processed samples
+   */
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
+    numSamples_ += arguments[0].getBatchSize();
+  }
+
+  /// finish() should be called before distributeEval
+  virtual void distributeEval(ParameterClient2* client) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  void mergeResultsOfAllClients(ParameterClient2* client) {
+    double data[2] = {totalScore_, numSamples_};
+    client->reduce(data, data, 2, FLAGS_trainer_id, 0);
+    totalScore_ = data[0];
+    numSamples_ = data[1];
+  }
+
+  /**
+   * @brief finish the evaluation.
+   */
+  virtual void finish() {}
+
+  /**
+   * @brief print the statistics of evaluate result
+   * @note finish() should be called before printStats
+   */
+  virtual void printStats(std::ostream& os) const {
+    os << config_.name() << "="
+       << (numSamples_ ? totalScore_ / numSamples_ : 0);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const Evaluator& evaluator) {
+    evaluator.printStats(os);
+    return os;
+  }
+
+  friend std::ostream&& operator<<(std::ostream&& os,  // NOLINT
+                                   const Evaluator& evaluator) {
+    evaluator.printStats(os);
+    return std::move(os);
+  }
+
+  static ClassRegistrar<Evaluator> registrar_;
+
+  /**
+   * @brief getNames will return all field names of current evaluator.
+   *
+   * The format of name is `evaluator_name.evaluator_fields`. If the evaluator
+   * has multiple fields, the name could be `evaluator_name.field1`. For
+   * example the PrecisionRecallEvaluator contains `precision`, `recall`
+   * fields. Then getNames will return `precision_recall_evaluator.precision`,
+   * `precision_recall_evaluator.recal`, etc.
+   *
+   * Also, if the current Evaluator is a combined evaluator, getNames will
+   * return all names of all evaluators inside the combined evaluator.
+   *
+   * @param names [out]: the field names of current evaluator.
+   * @note Never clear the names parameter inside getNames.
+   */
+  virtual void getNames(std::vector<std::string>* names) {
+    names->push_back(config_.name());
+  }
+
+  /**
+   * @brief getValue will return the current evaluate value of one field.
+   *
+   * @param name: The field name of current evaluator.
+   * @param err [out]: The error state.
+   *
+   * @return The evaluate value (metric).
+   */
+  virtual real getValue(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return .0f;
+    }
+    return this->getValueImpl();
+  }
+
+  /**
+   * @brief getType will return the evaluator type by field name.
+   *
+   * Evaluate Type is the current type of evaluator in string, such as 'auc'
+   * or 'precision_recall'. In a combined evaluator, different names may get
+   * different evaluate types, because they could be evaluated by different
+   * evaluators inside.
+   *
+   * @param name: The field name of current Evaluator.
+   * @param err: The error state. nullptr means don't care.
+   * @return the evaluator type string.
+   */
+  virtual std::string getType(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return std::string();
+    }
+    return this->getTypeImpl();
+  }
+
+ protected:
+  /**
+   * @brief getValueImpl The simplest way to define the getValue result. If
+   * this evaluator doesn't contain multiple fields and does not throw any
+   * error, just implement this method to get the evaluate result (metric).
+   * @return Evaluate result (metric).
+   */
+  virtual real getValueImpl() const {
+    return numSamples_ != .0 ? totalScore_ / numSamples_ : .0;
+  }
+
+  /**
+   * @brief getTypeImpl The simplest way to define the getType result. If this
+   * evaluator doesn't combine many evaluators, getType should only return
+   * its own type.
+   * @return Evaluator type.
+   */
+  virtual std::string getTypeImpl() const { return "base"; }
+
+ protected:
+  EvaluatorConfig config_;
+  double numSamples_;
+  double totalScore_;
+};
+
+/**
+ * @brief The NotGetableEvaluator class is the base class of evaluators whose
+ * values cannot be retrieved at runtime. Most NotGetableEvaluators are
+ * printer evaluators, which are only used to debug a network configuration.
+ */
+class NotGetableEvaluator : public Evaluator {
+  // Evaluator interface
+ public:
+  void getNames(std::vector<std::string>* names) {}
+
+  real getValue(const std::string& name, Error* err) const {
+    *err = Error("Not implemented");
+    return .0f;
+  }
+
+  std::string getType(const std::string& name, Error* err) const {
+    *err = Error("Not implemented");
+    return "";
+  }
+};
+
+class DummyEvaluator : public Evaluator {
+ public:
+  DummyEvaluator() {}
+  virtual void init(const EvaluatorConfig&) {}
+  virtual void start() {}
+  virtual void eval(const NeuralNetwork&) {}
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    (void)arguments;
+    return -1;
+  }
+  virtual void finish() {}
+  virtual void printStats(std::ostream&) const {}
+
+  // Evaluator interface
+ protected:
+  std::string getTypeImpl() const;
+};
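(Editorial note, not part of the patch.) A new metric only needs evalImp() plus a REGISTER_EVALUATOR line; start(), updateSamplesNum(), printStats() and the totalScore_/numSamples_ averaging are inherited. A hypothetical evaluator, assuming a CPU output matrix and relying on the evalImp() contract above (the framework sums the returned batch scores):

    // Hypothetical metric: mean absolute prediction, registered as "mean_abs".
    // A real evaluator would copy GPU output to CPU first, as the evaluators
    // in Evaluator.cpp do.
    class MeanAbsEvaluator : public Evaluator {
     public:
      virtual real evalImp(std::vector<Argument>& arguments) {
        const MatrixPtr& output = arguments[0].value;
        if (!output) return 0;
        real sum = 0;
        const real* data = output->getData();
        size_t n = output->getHeight() * output->getWidth();
        for (size_t i = 0; i < n; ++i) sum += data[i] < 0 ? -data[i] : data[i];
        return sum;  // accumulated into totalScore_; printStats() divides by
                     // numSamples_, giving the per-sample mean
      }

     protected:
      std::string getTypeImpl() const { return "mean_abs"; }
    };
    REGISTER_EVALUATOR(mean_abs, MeanAbsEvaluator);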
+/**
+ * @brief evaluate AUC using colIdx-th column as prediction.
+ * The AUC (Area Under the Curve) is a common evaluation metric
+ * for binary classification problems. It computes the area under
+ * the receiver operating characteristic (ROC) curve.
+ *
+ * @note colIdx-th column
+ *
+ * - colIdx = 0: the 0-th column.
+ * - colIdx > 0: the colIdx-th column.
+ * - colIdx < 0: the last colIdx-th column.
+ *
+ * The config file api is auc_evaluator.
+ *
+ */
+class AucEvaluator : public Evaluator {
+ public:
+  AucEvaluator(int32_t colIdx)
+      : colIdx_(colIdx),
+        realColumnIdx_(0),
+        cpuOutput_(nullptr),
+        cpuLabel_(nullptr),
+        cpuWeight_(nullptr) {}
+
+  virtual void start();
+
+  virtual real evalImp(std::vector<Argument>& arguments);
+
+  virtual void printStats(std::ostream& os) const {
+    os << config_.name() << "=" << calcAuc();
+  }
+
+  virtual void distributeEval(ParameterClient2* client);
+
+ private:
+  static const uint32_t kBinNum_ = (1 << 24) - 1;
+  static const int kNegativeLabel_ = 0;
+  double statPos_[kBinNum_ + 1];
+  double statNeg_[kBinNum_ + 1];
+  int32_t colIdx_;
+  uint32_t realColumnIdx_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
+  MatrixPtr cpuWeight_;
+
+  AucEvaluator() {}
+
+  inline static double trapezoidArea(double X1,
+                                     double X2,
+                                     double Y1,
+                                     double Y2) {
+    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
+  }
+
+  double calcAuc() const;
+
+  // Evaluator interface
+ protected:
+  real getValueImpl() const;
+  std::string getTypeImpl() const;
+};
+
+/**
+ * @brief RankAucEvaluator calculates the AUC of each list (i.e., titles
+ * under the same query), and averages them. Each list should be organized
+ * as a sequence. The inputs of this evaluator are [output, click, pv]. If pv
+ * is not provided, it will be set to 1. The types of click and pv are
+ * dense value.
+ */
+class RankAucEvaluator : public Evaluator {
+ public:
+  // evaluate ranking AUC
+  virtual void start();
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments);
+
+  virtual real evalImp(std::vector<Argument>& arguments);
+
+  virtual void distributeEval(ParameterClient2* client) {
+    mergeResultsOfAllClients(client);
+  }
+
+ private:
+  MatrixPtr output_;
+  MatrixPtr click_;
+  MatrixPtr pv_;
+  std::vector<std::pair<real, int>> outputPair_;
+
+  double calcRankAuc(real* outputData,
+                     real* clickData,
+                     real* pvData,
+                     size_t size);
+
+  // Evaluator interface
+ protected:
+  std::string getTypeImpl() const;
+};
+
+/**
+ * @brief precision, recall and f1 score Evaluator
+ * \f[
+ * precision = \frac{tp}{tp+fp} \\
+ * recall = \frac{tp}{tp+fn} \\
+ * f1 = 2*\frac{precision*recall}{precision+recall}
+ * \f]
+ *
+ * The config file api is precision_recall_evaluator.
+ */
+class PrecisionRecallEvaluator : public Evaluator {
+ public:
+  // Evaluate precision, recall and F1 score
+  PrecisionRecallEvaluator()
+      : isMultiBinaryLabel_(false),
+        cpuOutput_(nullptr),
+        cpuLabel_(nullptr),
+        cpuWeight_(nullptr) {}
+
+  virtual void start();
+
+  virtual real evalImp(std::vector<Argument>& arguments);
+
+  virtual void printStats(std::ostream& os) const;
+
+  virtual void distributeEval(ParameterClient2* client);
+
+  void getNames(std::vector<std::string>* names);
+
+  real getValue(const std::string& name, Error* err) const;
+
+  std::string getType(const std::string& name, Error* err) const;
+
+  struct StatsInfo {
+    /// numbers of true positives
+    double TP;
+    /// numbers of true negatives
+    double TN;
+    /// numbers of false positives
+    double FP;
+    /// numbers of false negatives
+    double FN;
+
+    StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
+  };
+
+ private:
+  bool isMultiBinaryLabel_;
+  std::vector<StatsInfo> statsInfo_;
+
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
+  MatrixPtr cpuWeight_;
+
+  struct PrintStatsInfo {
+    double precision;
+    double recall;
+    double f1;
+    double macroAvgPrecision;
+    double macroAvgRecall;
+    double macroAvgF1Score;
+    double microAvgPrecision;
+    double microAvgRecall;
+    double microAvgF1Score;
+  };
+
+  bool getStatsInfo(PrintStatsInfo* info) const;
+
+  void calcStatsInfo(const MatrixPtr& output,
+                     const IVectorPtr& label,
+                     const MatrixPtr& weight);
+
+  void calcStatsInfoMulti(const MatrixPtr& output,
+                          const MatrixPtr& label,
+                          const MatrixPtr& weight);
+
+  inline static double calcPrecision(double TP, double FP) {
+    if (TP > 0.0 || FP > 0.0) {
+      return TP / (TP + FP);
+    } else {
+      return 1.0;
+    }
+  }
+
+  inline static double calcRecall(double TP, double FN) {
+    if (TP > 0.0 || FN > 0.0) {
+      return TP / (TP + FN);
+    } else {
+      return 1.0;
+    }
+  }
+
+  inline static double calcF1Score(double precision, double recall) {
+    if (precision > 0.0 || recall > 0.0) {
+      return 2 * precision * recall / (precision + recall);
+    } else {
+      return 0;
+    }
+  }
+
+  mutable std::unordered_map<std::string, real> values_;
+
+  void storeLocalValues() const;
+};
+
+/*
+ * @brief positive-negative pair rate Evaluator
+ *
+ * The config file api is pnpair_evaluator.
+ */
+class PnpairEvaluator : public Evaluator {
+ public:
+  PnpairEvaluator()
+      : cpuOutput_(nullptr),
+        cpuLabel_(nullptr),
+        cpuInfo_(nullptr),
+        cpuWeight_(nullptr) {}
+
+  virtual void start();
+  virtual real evalImp(std::vector<Argument>& arguments);
+
+  struct PredictionResult {
+    PredictionResult(real __out, int __label, int __queryid, real __weight)
+        : out(__out), label(__label), queryid(__queryid), weight(__weight) {}
+    real out;
+    int label;
+    int queryid;
+    real weight;
+  };
+  std::vector<PredictionResult> predictArray_;
+  void printPredictResults() {
+    std::ofstream fs(FLAGS_predict_file);
+    CHECK(fs) << "Fail to open " << FLAGS_predict_file;
+    for (auto& res : predictArray_) {
+      fs << res.out << " " << res.label << " " << res.queryid << std::endl;
+    }
+  }
+
+  void stat(size_t start,
+            size_t end,
+            PredictionResult* answers,
+            double& pos,
+            double& neg,
+            double& spe);
+  void calc(std::vector<PredictionResult>& predictArray);
+
+  virtual void finish() { calc(predictArray_); }
+
+  virtual void printStats(std::ostream& os) const {
+    os << " pos/neg=" << this->getValueImpl();
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    client->reduce(pairArray_, pairArray_, kPairArrayNum_, FLAGS_trainer_id, 0);
+    LOG(INFO) << " distribute eval calc total pos pair: " << pairArray_[0]
+              << " calc total neg pair: " << pairArray_[1];
+  }
+
+ private:
+  static const uint32_t kPairArrayNum_ = 2;
+  double pairArray_[kPairArrayNum_];
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
+  IVectorPtr cpuInfo_;
+  MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+ protected:
+  real getValueImpl() const {
+    return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
+  }
+  std::string getTypeImpl() const;
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp
similarity index 100%
rename from paddle/gserver/gradientmachines/GradientMachine.cpp
rename to paddle/legacy/gserver/gradientmachines/GradientMachine.cpp
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachine.h b/paddle/legacy/gserver/gradientmachines/GradientMachine.h
new file mode 100644
index 0000000000000000000000000000000000000000..48f5141ce1ba7865ff63e489c31468c82df99afd
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/GradientMachine.h
@@ -0,0 +1,250 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "ModelConfig.pb.h"
+#include "TrainerConfig.pb.h"
+#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
+#include "paddle/legacy/gserver/layers/Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
+#include "paddle/utils/Thread.h"
+
+#ifndef PADDLE_MOBILE_INFERENCE
+#include "paddle/legacy/gserver/evaluators/Evaluator.h"
+#endif
+
+namespace paddle {
+/**
+ * @brief A gradient machine is capable of calculating some outputs given
+ *        some inputs and performing gradient calculation based on the
+ *        derivative from the outputs.
+ *
+ * A gradient machine can be either a full neural network or part of a neural
+ * network.
+ *
+ * Usage for training:
+ *
+ * 1. Prepare inArgs. Put your input data into inArgs[i].value.
+ *
+ * 2. Call forward(inArgs, &outArgs)
+ *
+ * 3. Calculate gradient with respect to outArgs[i]->value
+ *    and fill them into outArgs[i]->grad.
+ *    This step can be skipped if the outputs are from cost layers.
+ *
+ * 4. Call backward(). After backward, gradient of each parameter is
+ *    accumulated to getParameters()[i]->getBuf(PARAMETER_GRADIENT)
+ *
+ * 5. Update parameter value getParameters()[i]->getBuf(PARAMETER_VALUE) using
+ *    gradients.
+ *
+ * 6. Clear gradients to zero.
+ *
+ * Usage for prediction:
+ *
+ * 1. Prepare inArgs. Put your input data into inArgs[i].value.
+ *
+ * 2. Call forward(inArgs, &outArgs)
+ *
+ * 3. Obtain the prediction result from outArgs[i]
+ */
+
+typedef std::vector<LayerStatePtr> MachineState;
+
+class GradientMachine;
+
+typedef std::shared_ptr<GradientMachine> GradientMachinePtr;
+
+class GradientMachine {
+ public:
+  enum CreateMode {
+    kNormal = 0,
+    kSgdSparseCpuTraining = 3,
+    kTesting = 4,
+    kCustom = 10
+  };
+
+  /**
+   * Create a gradient machine from ModelConfig
+   * Parameter will have parameterTypes
+   */
+  static GradientMachine* create(
+      const ModelConfig& config,
+      int mode = kNormal,
+      const std::vector<ParameterType>& parameterTypes =
+          std::vector<ParameterType>{
+              PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM});
+
+  virtual ~GradientMachine() {}
+
+  /**
+   * Prefetch row ids of sparse parameter.
+   */
+  virtual void prefetch(const std::vector<Argument>& inArgs) { (void)inArgs; }
+
+  /**
+   * @brief Forward propagation.
+   *
+   * Calculate outputs (outArgs) based on the inputs (inArgs)
+   *
+   * @note: if passType==PASS_TEST, then backward() should not be called
+   */
+  virtual void forward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType) = 0;
+
+  /**
+   * @brief Backward propagation.
+   *
+   * Calculate the gradient of inArgs and parameter.
+   *
+   * This function should only be called after a corresponding forward() call.
+   * The caller is responsible for filling the correct grad for the outArgs
+   * obtained using forward().
+   *
+   * It may also change the grad field for the inArgs supplied at forward()
+   */
+  virtual void backward(const UpdateCallback& callback = nullptr) = 0;
+
+  /**
+   * Combine forward() and backward(). For multithread training, this
+   * may be faster.
+   *
+   * @note: passType PASS_TEST is not allowed for forwardBackward().
+   */
+  virtual void forwardBackward(const std::vector<Argument>& inArgs,
+                               std::vector<Argument>* outArgs,
+                               PassType passType,
+                               const UpdateCallback& callback = nullptr) {
+    forward(inArgs, outArgs, passType);
+    backward(callback);
+  }
+
+  virtual Argument getLayerOutput(const std::string& layerName) = 0;
+
+  // see comment in Layer.h for the function with the same name
+  virtual void resetState() {}
+
+  // set machine state
+  virtual void setState(const MachineState& machineState) {}
+
+  // save machine state
+  virtual void getState(MachineState& machineState) {}
+
+  virtual void onPassEnd() = 0;
+
+#ifndef PADDLE_MOBILE_INFERENCE
+  /**
+   * Create an evaluator which can be used for eval()
+   */
+  virtual Evaluator* makeEvaluator() const = 0;
+
+  /**
+   * evaluate using the given evaluator
+   */
+  virtual void eval(Evaluator* evaluator) const = 0;
+#endif
+
+  std::vector<ParameterPtr>& getParameters() { return parameters_; }
+
+  std::vector<ParameterPtr>& getNonStaticParameters() {
+    if (nonStaticParameters_.empty()) {
+      for (auto para : parameters_) {
+        if (!para->isStatic()) {
+          nonStaticParameters_.push_back(para);
+        }
+      }
+    }
+    return nonStaticParameters_;
+  }
+
+  inline bool hasStaticParameters() {
+    return parameters_.size() != getNonStaticParameters().size();
+  }
+
+  /**
+   * @brief Used before formal training: start work threads and set
+   *        trainer parameters.
+   *
+   * @note This function will only be implemented and used in a
+   *       multithreaded environment.
+   */
+  virtual void start() {}
+
+  /**
+   * @brief check each work thread for failure/error/finish; if none,
+   *        return true, otherwise return false.
+   *
+   * @note This function will only be implemented and used in a
+   *       multithreaded environment.
+   */
+  virtual void finish() {}
+
+  /**
+   * @brief set the training status to a "finished" value; the sub work
+   *        threads will notice the change and then exit.
+   *
+   * @note This function will only be implemented and used in a
+   *       multithreaded environment.
+   */
+  virtual bool trainIsOn() { return true; }
+
+  /**
+   * @brief when all or some of the sub work threads are suspended, waiting
+   *        for the controller's instructions, and the controller has finished
+   *        some processing, it calls this function to wake up all the
+   *        pending threads.
+   *
+   * @note This function will only be implemented and used in a
+   *       multithreaded environment.
+   */
+  virtual void restart() {}
+
+  /// Set the gradient of the output from outside.
+  virtual void setOutputGrad(const std::vector<Argument>& args) {
+    LOG(FATAL) << "Not implemented!";
+  }
+
+  void saveParameters(const std::string& dir) const;
+
+  void loadParameters(const std::string& dir);
+
+  void randParameters();
+
+  virtual void getStats(real& cost, int64_t& numProcessed) {
+    (void)cost;
+    (void)numProcessed;
+  }
+
+  /**
+   * @brief Release the middle layer's output memory.
+   *
+   * @note This function is used for memory optimization in inference.
+   */
+  virtual void releaseOutput() {}
+
+ protected:
+  virtual void onLoadParameter() {}
+
+  std::vector<ParameterPtr> parameters_;
+  std::vector<ParameterPtr> nonStaticParameters_;
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.cpp b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp
similarity index 100%
rename from paddle/gserver/gradientmachines/GradientMachineMode.cpp
rename to paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp
diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.h b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h
similarity index 100%
rename from paddle/gserver/gradientmachines/GradientMachineMode.h
rename to paddle/legacy/gserver/gradientmachines/GradientMachineMode.h
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp
similarity index 100%
rename from paddle/gserver/gradientmachines/MultiGradientMachine.cpp
rename to paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h
similarity index 100%
rename from paddle/gserver/gradientmachines/MultiGradientMachine.h
rename to paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp
similarity index 100%
rename from paddle/gserver/gradientmachines/MultiNetwork.cpp
rename to paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.h b/paddle/legacy/gserver/gradientmachines/MultiNetwork.h
similarity index 100%
rename from paddle/gserver/gradientmachines/MultiNetwork.h
rename to paddle/legacy/gserver/gradientmachines/MultiNetwork.h
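(Editorial note, not part of the patch.) A schematic driver for the six training steps listed in the GradientMachine header comment; the parameter-update call is illustrative only, since real training goes through a ParameterUpdater:

    // Hypothetical driver for steps 1-6 of the usage comment above.
    void trainOneBatch(paddle::GradientMachine* gm,
                       const std::vector<paddle::Argument>& inArgs) {
      std::vector<paddle::Argument> outArgs;
      gm->forward(inArgs, &outArgs, paddle::PASS_TRAIN);  // step 2
      // Step 3 is skipped on the assumption that the outputs come from cost
      // layers, which fill their own gradients.
      gm->backward();                                     // step 4
      for (auto& param : gm->getParameters()) {           // steps 5 and 6
        // Schematic SGD update: value += -lr * gradient, then clear it.
        param->getBuf(paddle::PARAMETER_VALUE)
            ->add(*param->getBuf(paddle::PARAMETER_GRADIENT), -0.01);
        param->getBuf(paddle::PARAMETER_GRADIENT)->zeroMem();
      }
    }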
diff --git a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..339550c458f5e79fb1afa79952ffd373c2850ec4
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp
@@ -0,0 +1,548 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Util.h"
+
+#include "NeuralNetwork.h"
+#include "hl_gpu.h"
+#include "paddle/utils/CustomStackTrace.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
+#endif
+
+#ifndef PADDLE_MOBILE_INFERENCE
+#include "MultiNetwork.h"
+#include "RecurrentGradientMachine.h"
+#include "paddle/legacy/gserver/layers/AgentLayer.h"
+#endif
+
+namespace paddle {
+void parameterInitNN(int paramId,
+                     Parameter* para,
+                     std::vector<ParameterPtr>* sharedParams) {
+  // Create parameter values.
+  if (!para->useGpu() && sharedParams) {
+    para->enableSharedType(PARAMETER_VALUE,
+                           (*sharedParams)[paramId]->getBuf(PARAMETER_VALUE),
+                           (*sharedParams)[paramId]->getMat(PARAMETER_VALUE));
+  } else {
+    if (para->isSparseRemoteUpdate()) {
+      para->enableType(PARAMETER_VALUE,
+                       FLAGS_loadsave_parameters_in_pserver
+                           ? Parameter::MAT_SPARSE_ROW_PREFETCH
+                           : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE);
+    } else {
+      para->enableType(PARAMETER_VALUE);
+    }
+  }
+  // Create parameter gradients.
+  if (para->isSparseRemoteUpdate() && !sharedParams) {
+    para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW);
+  } else if (para->isGradSparseUpdate()) {
+    para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_AUTO_GROW);
+  } else if (!para->isStatic()) {
+    para->enableType(PARAMETER_GRADIENT);
+  }
+}
+
+NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) {
+#ifndef PADDLE_MOBILE_INFERENCE
+  if (config.type() == "recurrent_nn") {
+    return newNeuralNetwork("root");
+  } else if (config.type() == "multi_nn") {
+    return new MultiNetwork("root");
+  } else {
+    return newNeuralNetwork();
+  }
+#else
+  return new NeuralNetwork();
+#endif
+}
+
+std::map<std::string, bool> NeuralNetwork::dllInitMap;
+
+void NeuralNetwork::init(const ModelConfig& config,
+                         ParamInitCallback callback,
+                         const std::vector<ParameterType>& parameterTypes,
+                         bool useGpu) {
+  using std::placeholders::_1;
+  using std::placeholders::_2;
+  ParamInitCallback paramCallback = nullptr;
+  if (callback != nullptr) {
+    paramSelfInited_ = false;
+    paramCallback = callback;
+  } else {
+    paramSelfInited_ = true;
+    paramCallback = std::bind(parameterInitNN, _1, _2, nullptr);
+  }
+  config_ = config;
+
+  if (rootNetwork_ != nullptr) {
+    // direct use parameters_ and parameterMap_ from base network
+    CHECK_EQ((size_t)config.parameters_size(),
+             rootNetwork_->getParameters().size());
+    parameters_ = rootNetwork_->getParameters();
+    parameterMap_ = *(rootNetwork_->getParameterMap());
+  } else {
+    parameters_.reserve(config.parameters_size());
+    for (const auto& para_config : config.parameters()) {
+      auto parameter = std::make_shared<Parameter>(para_config,
+                                                   useGpu,
+                                                   /*initialize=*/false);
+      paramCallback(parameters_.size(), parameter.get());
+      if (!callback) {
+        for (ParameterType type :
+             (parameter->isStatic()
+                  ? std::vector<ParameterType>{PARAMETER_VALUE}
+                  : parameterTypes)) {
+          if (type != PARAMETER_VALUE && type != PARAMETER_GRADIENT) {
+            parameter->enableType(type);
+          }
+        }
+      }
+      parameter->setID(parameters_.size());
+      parameters_.push_back(parameter);
+      CHECK(!parameterMap_.count(parameter->getName()));
+      parameterMap_[parameter->getName()] = parameter;
+    }
+  }
+
+  auto layerCreate = [&](const LayerConfig& layer_config) {
+    auto layer = Layer::create(layer_config);
+    CHECK(layer) << "Create layer failed. Layer name:" << layer->getName();
+    layers_.push_back(layer);
+    CHECK(!layerMap_.count(layer->getName()));
+    layerMap_[layer->getName()] = layer;
+  };
+
+  auto subModelConfig = std::find_if(config.sub_models().begin(),
+                                     config.sub_models().end(),
+                                     [=](const SubModelConfig& sub_model) {
+                                       return sub_model.name() == subModelName_;
+                                     });
+  bool useSubModel = (subModelConfig != config.sub_models().end());
+  CHECK_EQ(useSubModel, !subModelName_.empty());
+  if (useSubModel) {
+    layers_.reserve(subModelConfig->layer_names_size());
+    for (const auto& layer_name : subModelConfig->layer_names()) {
+      auto layer_config =
+          std::find_if(config.layers().begin(),
+                       config.layers().end(),
+                       [=](const LayerConfig& layer_config) {
+                         return layer_config.name() == layer_name;
+                       });
+      CHECK(layer_config != config.layers().end());
+      layerCreate(*layer_config);
+    }
+  } else {
+    layers_.reserve(config.layers_size());
+    for (const auto& layer_config : config.layers()) {
+      bool useLayer = true;
+      if (config.has_external_config()) {
+        useLayer = true;
+        for (const auto& name : config.external_config().layer_names()) {
+          if (layer_config.name() == name) {
+            useLayer = false;
+            break;
+          }
+        }
+      }
+      if (useLayer) {
+        layerCreate(layer_config);
+      }
+    }
+  }
+
+  for (const auto& layer : layers_) {
+    layer->init(layerMap_, parameterMap_);
+    layer->initSubNetwork(this /*root*/, config_, parameterTypes, useGpu);
+  }
+
+  for (const auto& layer_name :
+       (useSubModel ? subModelConfig->input_layer_names()
+                    : config.input_layer_names())) {
+    auto it = layerMap_.find(layer_name);
+    CHECK(it != layerMap_.end());
+    dataLayers_.push_back(std::dynamic_pointer_cast<DataLayer>(it->second));
+  }
+
+  for (const auto& layer_name :
+       (useSubModel ? subModelConfig->output_layer_names()
+                    : config.output_layer_names())) {
+    auto it = layerMap_.find(layer_name);
+    CHECK(it != layerMap_.end());
+    outputLayers_.push_back(it->second);
+  }
+
+  for (const auto& layer : layers_) {
+    const auto& name = layer->getName();
+    bool isMiddleLayer = true;
+
+    // if data layer
+    for (const auto& dataLayer : dataLayers_) {
+      if (name == dataLayer->getName()) {
+        isMiddleLayer = false;
+        break;
+      }
+    }
+
+    // if output layer
+    for (const auto& dataLayer : outputLayers_) {
+      if (name == dataLayer->getName()) {
+        isMiddleLayer = false;
+        break;
+      }
+    }
+
+    if (isMiddleLayer) {
+      middleLayers_.push_back(layer);
+    }
+  }
+}
+
+void NeuralNetwork::connect(LayerPtr agentLayer,
+                            LayerPtr realLayer,
+                            int height) {
+#ifndef PADDLE_MOBILE_INFERENCE
+  AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
+  CHECK_NOTNULL(agent);
+  agent->setRealLayer(realLayer, height);
+#endif
+}
+
+void NeuralNetwork::connect(std::string agentLayerName,
+                            NeuralNetwork* srcNN,
+                            std::string realLayerName) {
+  connect(this->getLayer(agentLayerName), srcNN->getLayer(realLayerName));
+}
+
+void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) {
+  CHECK_EQ(inArgs.size(), dataLayers_.size());
+
+  if (paramSelfInited_) {
+    for (auto& para : parameters_) {
+      if (para->isSparseRemoteUpdate()) {
+        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
+            para->getMat(PARAMETER_VALUE).get());
+        para->clearGradient();
+        if (mat) mat->clearIndices();
+      }
+    }
+  }
+
+  for (size_t i = 0; i != dataLayers_.size(); ++i) {
+    if (FLAGS_parallel_nn) {
+      const_cast<Argument&>(inArgs[i]).deviceId = -1;
+    }
+    dataLayers_[i]->setData(inArgs[i]);
+  }
+
+  for (auto& layer : layers_) {
+    layer->prefetch();
+  }
+
+  if (paramSelfInited_) {
+    for (auto& para : parameters_) {
+      if (para->isSparseRemoteUpdate()) {
+        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
+            para->getMat(PARAMETER_VALUE).get());
para->getMat(PARAMETER_VALUE).get()); + mat->setupIndices(); + auto matGrad = dynamic_cast( + para->getMat(PARAMETER_GRADIENT).get()); + matGrad->reserveStore(); + } + } + } +} + +void NeuralNetwork::forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType) { + CHECK_EQ(inArgs.size(), dataLayers_.size()); + outArgs->resize(outputLayers_.size()); + for (size_t i = 0; i != dataLayers_.size(); ++i) { + dataLayers_[i]->setData(inArgs[i]); + } + + gLayerStackTrace.set_stage(true); + + { + for (auto& layer : layers_) { + REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str()); + gLayerStackTrace.push(layer->getName()); + layer->forward(passType); + gLayerStackTrace.pop(layer->getName()); + } + } + + outArgs->clear(); + outArgs->reserve(outputLayers_.size()); + for (auto& layer : outputLayers_) { + outArgs->push_back(layer->getOutput()); + } +} + +void NeuralNetwork::resetState() { + for (auto& layer : layers_) { + layer->resetState(); + } +} + +void NeuralNetwork::setState(const MachineState& machineState) { + for (size_t i = 0; i < layers_.size(); i++) { + if (machineState[i] != nullptr) { + layers_[i]->setState(machineState[i]); + } + } +} + +void NeuralNetwork::getState(MachineState& machineState) { + machineState.clear(); + machineState.reserve(layers_.size()); + for (auto& layer : layers_) { + LayerStatePtr p = layer->getState(); + machineState.push_back(p); + } +} + +void NeuralNetwork::backward(const UpdateCallback& callback) { + gLayerStackTrace.set_stage(false); + FOR_EACH_R(layer, layers_) { + REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str()); + gLayerStackTrace.push((*layer)->getName()); + if ((*layer)->needGradient()) { + (*layer)->backward(callback); + } + gLayerStackTrace.pop((*layer)->getName()); + } +} + +void NeuralNetwork::finish() { +#ifdef PADDLE_WITH_MKLDNN + FOR_EACH_R(layer, layers_) { + MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(*layer); + if (dnnLayer) { + dnnLayer->convertWeightsToPaddle(); + } + } +#endif +} + +Argument NeuralNetwork::getLayerOutput(const std::string& layerName) { + return getLayer(layerName)->getOutput(); +} + +void NeuralNetwork::onPassEnd() { + for (auto& layer : layers_) { + layer->onPassEnd(); + } +} + +void NeuralNetwork::releaseOutput() { + for (auto& layer : middleLayers_) { + Argument& arg = layer->getOutput(); + arg.value.reset(); + } +} + +#ifndef PADDLE_MOBILE_INFERENCE + +class CombinedEvaluator : public Evaluator { + public: + void addEvaluator(std::unique_ptr&& evaluator) { + evaluators_.emplace_back(std::move(evaluator)); + } + void start() override { + for (auto& evaluator : evaluators_) { + evaluator->start(); + } + } + + void finish() override { + for (auto& evaluator : evaluators_) { + evaluator->finish(); + } + } + + void eval(const NeuralNetwork& nn) override { + for (auto& evaluator : evaluators_) { + evaluator->eval(nn); + } + } + real evalImp(std::vector& arguments) override { + (void)arguments; + return -1; + } + void printStats(std::ostream& os) const override { + for (auto& evaluator : evaluators_) { + evaluator->printStats(os); + os << ' '; + } + } + + void distributeEval(ParameterClient2* client) override { + for (auto& evaluator : evaluators_) { + evaluator->distributeEval(client); + } + } + + protected: + std::vector> evaluators_; + + // Evaluator interface + public: + /** + * @brief getNames will return all inside evaluators' names. + * @param names [out]: return names. 
+ */
+  void getNames(std::vector<std::string>* names) override {
+    for (auto& eval : evaluators_) {
+      eval->getNames(names);
+    }
+  }
+
+  /**
+   * @brief getValue returns the value of the inner evaluator with the
+   *        given name.
+   */
+  real getValue(const std::string& name, Error* err) const override {
+    return this->getMethodHelper<real>(
+        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
+          return eval->getValue(name, err);
+        });
+  }
+
+  /**
+   * @brief getType returns the type of the inner evaluator with the
+   *        given name.
+   */
+  std::string getType(const std::string& name, Error* err) const override {
+    return this->getMethodHelper<std::string>(
+        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
+          return eval->getType(name, err);
+        });
+  }
+
+ private:
+  template <typename T>
+  T getMethodHelper(const std::string& name,
+                    Error* err,
+                    const std::function<T(const std::unique_ptr<Evaluator>&)>&
+                        callback) const {
+    for (auto& eval : evaluators_) {
+      std::vector<std::string> names;
+      eval->getNames(&names);
+      if (std::find(names.begin(), names.end(), name) != names.end()) {
+        return callback(eval);
+      }
+    }
+    *err = Error("No such key %s", name.c_str());
+    return T();
+  }
+};
+
+class SubnetEvaluator : public CombinedEvaluator {
+ public:
+  SubnetEvaluator(const std::string& layerName,
+                  std::unique_ptr<Evaluator>&& evaluator)
+      : layerName_(layerName) {
+    addEvaluator(std::move(evaluator));
+  }
+  void eval(const NeuralNetwork& nn) override {
+    const LayerPtr& layer = nn.getLayer(layerName_);
+    CHECK(layer) << "Nonexistent layer: " << layerName_ << " in submodel "
+                 << nn.getName();
+    bool accessed = false;
+    layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) {
+      subnet.eval(evaluators_[0].get());
+      accessed = true;
+    });
+    CHECK(accessed) << "There is no subnetwork for layer " << layerName_
+                    << " in submodel " << nn.getName();
+  }
+
+ protected:
+  std::string layerName_;
+};
+
+Evaluator* NeuralNetwork::makeEvaluator() const {
+  CombinedEvaluator* combinedEvaluator = new CombinedEvaluator();
+  auto subModelConfig = std::find_if(config_.sub_models().begin(),
+                                     config_.sub_models().end(),
+                                     [=](const SubModelConfig& sub_model) {
+                                       return sub_model.name() == subModelName_;
+                                     });
+  bool useSubModel = (subModelConfig != config_.sub_models().end());
+  CHECK_EQ(useSubModel, !subModelName_.empty());
+  if (useSubModel) {
+    // create the evaluators that belong to the CURRENT submodel
+    for (int i = 0; i < subModelConfig->evaluator_names_size(); ++i) {
+      // find evaluator by name
+      auto thisEvalConfig = std::find_if(
+          config_.evaluators().begin(),
+          config_.evaluators().end(),
+          [=](const EvaluatorConfig& ecfg) {
+            return ecfg.name() == subModelConfig->evaluator_names(i);
+          });
+      bool validConfig = (thisEvalConfig != config_.evaluators().end());
+      if (validConfig) {
+        std::unique_ptr<Evaluator> evaluator(
+            Evaluator::create(*thisEvalConfig));
+        combinedEvaluator->addEvaluator(std::move(evaluator));
+      }
+    }
+    for (auto& layer : layers_) {
+      layer->accessSubNetwork(
+          [layer, combinedEvaluator](NeuralNetwork& subnet) {
+            std::unique_ptr<Evaluator> subEvaluator(new SubnetEvaluator(
+                layer->getName(),
+                std::unique_ptr<Evaluator>(subnet.makeEvaluator())));
+            combinedEvaluator->addEvaluator(std::move(subEvaluator));
+          });
+    }
+  } else {
+    for (const EvaluatorConfig& evalConfig : config_.evaluators()) {
+      std::unique_ptr<Evaluator> evaluator(Evaluator::create(evalConfig));
+      combinedEvaluator->addEvaluator(std::move(evaluator));
+    }
+  }
+  return combinedEvaluator;
+}
+
+void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
+
+#endif
+
+void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
+  CHECK_GE(outputLayers_.size(), args.size());
+  for
(size_t i = 0; i < args.size(); ++i) { + outputLayers_[i]->getOutput().grad = args[i].grad; + } +} + +extern NeuralNetwork* newCustomNerualNetwork(const std::string& name, + NeuralNetwork* network) + __attribute__((weak)); + +NeuralNetwork* NeuralNetwork::newNeuralNetwork(const std::string& name, + NeuralNetwork* rootNetwork) { + if (newCustomNerualNetwork) { + return newCustomNerualNetwork(name, rootNetwork); + } else { + return new NeuralNetwork(name, rootNetwork); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h new file mode 100644 index 0000000000000000000000000000000000000000..5a0909b99b3aaf389b8c6457de7b37cbe00bfae7 --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/dataproviders/DataProvider.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/gserver/layers/CostLayer.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/gserver/layers/Layer.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/utils/ClassRegistrar.h" + +namespace paddle { +/* + * @brief Init function for the parameters. + * @param paramId: the id of the parameter to init. + * @param para: the pointer to the parameter to init. + * @param sharedParams: the pointer to an array of the parameter to be shared. + * If it is null, no parameter sharing is used. + * Only CPU paramters can be shared. + * It handles CPU, CPU sparse, CPU sparse remote, + * and GPU parameters differently. If the type + * of a parameter is NORMAL. Basically nothing need to be done. + * CPU value: NORMAL. + * CPU param: NORMAL. + * + * CPU sparse value: NORMAL. + * CPU sparse gradient: MAT_SPARSE_ROW_AUTO_GROW. + * + * CPU sparse remote value: MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE). + * CPU sparse remote gradient: MAT_SPARSE_ROW (!sharedParams) + * MAT_SPARSE_ROW_AUTO_GROW (sharedParams) + * + * GPU value: NORMAL + * GPU param: NORMAL + */ +void parameterInitNN(int paramId, + Parameter* para, + std::vector* sharedParams); + +class NeuralNetwork : public GradientMachine { + public: + virtual void init(const ModelConfig& config, + ParamInitCallback callback = nullptr, + const std::vector& parameterTypes = + std::vector{PARAMETER_VALUE, + PARAMETER_GRADIENT, + PARAMETER_MOMENTUM}, + bool useGpu = FLAGS_use_gpu); + + /** + * Connect two submodels and + * down-submodel's output become up-submodel's input. + * By default, connection is one by one, + * If the agent height is smaller than real layer, *height* has to be filled. + * + * @param realLayer The down-submodel's output layer. + * @param agentLayer The up-submodel's input agent layer. 
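 * A sketch (the layer names here are hypothetical, not part of this header):
 * wiring a memory agent of the current frame to the previous frame's output
 * would look like
 *   NeuralNetwork::connect(curFrame->getLayer("mem_agent"),
 *                          prevFrame->getLayer("rnn_out"),
 *                          numAliveSeqs);  // height = sequences still alive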
+ */ + static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0); + void connect(std::string agentLayerName, + NeuralNetwork* srcNN, + std::string realLayerName); + + virtual void prefetch(const std::vector& inArgs); + + virtual void forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType); + + virtual void backward(const UpdateCallback& callback = nullptr); + + virtual Argument getLayerOutput(const std::string& layerName); + + const LayerPtr& getLayer(const std::string& layerName) const { + auto it = layerMap_.find(layerName); + CHECK(it != layerMap_.end()) << "Unknown layer " << layerName; + return it->second; + } + + virtual void onPassEnd(); + +#ifndef PADDLE_MOBILE_INFERENCE + virtual Evaluator* makeEvaluator() const; + + virtual void eval(Evaluator* evaluator) const; +#endif + + virtual void resetState(); + virtual void setOutputGrad(const std::vector& args); + + /// set machine state + virtual void setState(const MachineState& machineState); + + /// get machine state + virtual void getState(MachineState& machineState); + + static NeuralNetwork* create(const ModelConfig& config); + + ParameterMap* getParameterMap() { return ¶meterMap_; } + + /** + * @brief Access each layer as a for each loop. + * @param callback invoke with each layer. + */ + template + void forEachLayer(T callback) { + for (auto& l : layers_) { + if (callback(l)) { + break; + } + } + } + + static NeuralNetwork* newNeuralNetwork(const std::string& name = "", + NeuralNetwork* rootNetwork = nullptr); + + const std::string& getName() const { return subModelName_; } + + /// some finish work, like convert the weight format of MKLDNNLayers + void finish(); + + /** + * @brief Release the middle layer's output memory. + * + * @note This function is used for memory optimization in inference. + */ + void releaseOutput(); + + protected: + /** + * The constructor of NeuralNetwork. + * The sub networks can get parameters_ and parameterMap_ + * from base NeuralNetwork. + * + * @param subModelName The name of sub-model. + * @param rootNetwork It used in MultiNetwork. 
+ */ + NeuralNetwork(std::string subModelName = "", + NeuralNetwork* rootNetwork = nullptr) + : subModelName_(subModelName), rootNetwork_(rootNetwork) {} + + std::string subModelName_; + ModelConfig config_; + std::vector layers_; + ParameterMap parameterMap_; + LayerMap layerMap_; + + std::vector dataLayers_; + std::vector outputLayers_; + std::vector middleLayers_; + + static std::map dllInitMap; + + NeuralNetwork* rootNetwork_; + + /// Whether parameter of this NN is initialized by its own + /// (i.e., not by callback supplied with the caller) + bool paramSelfInited_; +}; + +} // namespace paddle diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp similarity index 100% rename from paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp rename to paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h similarity index 100% rename from paddle/gserver/gradientmachines/ParallelNeuralNetwork.h rename to paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h diff --git a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e749cf61f304881f2d789a3ebac87a67cec75828 --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -0,0 +1,1501 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "RecurrentGradientMachine.h" +#include +#include +#include +#include +#include +#include "NeuralNetwork.h" +#include "paddle/legacy/gserver/layers/AgentLayer.h" +#include "paddle/utils/Flags.h" +#include "paddle/utils/Stat.h" +#include "paddle/utils/Util.h" + +DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so"); + +static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob"; +static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob"; +static const char* DIY_FINISH_CALC_PROB_SYMBOL_NAME = "finish_calc_prob"; + +namespace paddle { + +/** + * Start Custom Calculate Probability callback type. + * + * @param nNode, nodes: the path will be explored. nNodes is array size. + * nodes is array elements. + * + * @return: A custom handler id that will passed to another callback. + */ +typedef int (*DiyStartCalcProbCallback)(size_t nNodes, int* nodes); + +/** + * Doing Custom Calculation of Probability callback type. + * + * @param handler: User custom handler. The return value from start calc prob. + * @param nNode, nodes: Array. The current path. + * @param curProb: The current log probability that neural network returns. + * + * @return: Log probability which user calculated, it will be updated to this + * path. + * @NOTE: Return -INFINITY will DROP this path IMMEDIATELY!! 
+ */
+typedef real (*DiyCalcProbCallback)(
+    int handler, size_t nNodes, int* nodes, real curProb, bool atEos);
+
+/**
+ * Finish Custom Calculation of Probability callback type.
+ *
+ * @param handler: User custom handler, i.e. the value returned by the
+ * start-calc-prob callback.
+ */
+typedef void (*DiyStopCalcProbCallback)(int handler);
+
+static DiyCalcProbCallback gDiyProbMethod = nullptr;
+static DiyStartCalcProbCallback gDiyProbStart = nullptr;
+static DiyStopCalcProbCallback gDiyProbStop = nullptr;
+static void* gDiyProbHandle = nullptr;
+
+static void exit_diy_prob() { dlclose(gDiyProbHandle); }
+
+template <typename SymbolType>
+static inline SymbolType loadDiySymbol(const char* symbolName) {
+  void* sym = dlsym(gDiyProbHandle, symbolName);
+  CHECK(sym) << "Cannot load symbol " << symbolName << " from "
+             << FLAGS_diy_beam_search_prob_so;
+  return reinterpret_cast<SymbolType>(sym);
+}
+
+static InitFunction __init__diy_prob_method(
+    [] {
+      std::string soName = FLAGS_diy_beam_search_prob_so;
+      if (!soName.empty()) {
+        gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
+        CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
+        atexit(exit_diy_prob);
+        gDiyProbMethod =
+            loadDiySymbol<DiyCalcProbCallback>(DIY_CALC_PROB_SYMBOL_NAME);
+        gDiyProbStart = loadDiySymbol<DiyStartCalcProbCallback>(
+            DIY_START_CALC_PROB_SYMBOL_NAME);
+        gDiyProbStop = loadDiySymbol<DiyStopCalcProbCallback>(
+            DIY_FINISH_CALC_PROB_SYMBOL_NAME);
+      }
+    },
+    std::numeric_limits<int>::max());
+
+class BeamSearchControlCallbacks {
+ public:
+  RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback
+      beamSearchCandidateAdjust;
+  RecurrentGradientMachine::NormOrDropNodeCallback normOrDropNode;
+  RecurrentGradientMachine::DropCallback stopDetermineCandidates;
+
+  //! gcc 4.6 does not handle aggregate initialization well, so an explicit
+  //! constructor is needed.
+  BeamSearchControlCallbacks(
+      const RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback&
+          candidateAdjust,
+      const RecurrentGradientMachine::NormOrDropNodeCallback& norm,
+      const RecurrentGradientMachine::DropCallback& stop)
+      : beamSearchCandidateAdjust(candidateAdjust),
+        normOrDropNode(norm),
+        stopDetermineCandidates(stop) {}
+};
+
+class BeamSearchStatisticsCallbacks {
+ public:
+  RecurrentGradientMachine::EachStepCallback onEachStepStarted;
+  RecurrentGradientMachine::EachStepCallback onEachStepStoped;
+
+  BeamSearchStatisticsCallbacks(
+      const RecurrentGradientMachine::EachStepCallback& start,
+      const RecurrentGradientMachine::EachStepCallback& stop)
+      : onEachStepStarted(start), onEachStepStoped(stop) {}
+};
+
+RecurrentGradientMachine::RecurrentGradientMachine(
+    const std::string& subModelName, NeuralNetwork* rootNetwork)
+    : NeuralNetwork(subModelName),
+      rootNetwork_(rootNetwork),
+      beamSearchCtrlCallbacks_(nullptr),
+      beamSearchStatistics_(nullptr) {
+  CHECK(!subModelName_.empty());
+}
+
+/**
+ * A bias layer that, as the input of memory frame 0, outputs a vector of
+ * zeros if the bias parameter is not set.
+ *
+ * The boot bias layer is created directly in the recurrent gradient machine
+ * because:
+ *
+ * 1. It covers only one frame, so it should not be placed in the layer
+ *    group, which is instantiated once for every frame.
+ *
+ * 2. It has no input layer, so it needs resetHeight() before forward(),
+ *    and resetHeight() must be called in the recurrent gradient machine,
+ *    so it should not be placed in the root network.
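+ *
+ * 3. When boot_with_const_id is configured, the layer outputs a constant id
+ *    vector instead of a bias value (see resetHeight() below), so a memory
+ *    can be bootstrapped with a fixed begin-of-sequence id.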
+ */ +class BootBiasLayer : public Layer { + protected: + std::unique_ptr biases_; + IVectorPtr cpuIds_; + + public: + explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override { + if (!Layer::init(layerMap, parameterMap)) return false; + + if (biasParameter_) { + biases_ = + std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + return true; + } + + void resetHeight(int height) { + if (config_.has_bos_id()) { // used as a constant id layerConfig + IVector::resizeOrCreate(output_.ids, height, useGpu_); + output_.ids->reset((int)config_.bos_id()); + } else { + resetOutput(height, getSize()); + } + } + + void forward(PassType passType) override { + if (biases_) { + MatrixPtr outV = getOutputValue(); + outV->addBias(*(biases_->getW()), 1); + forwardActivation(); + } + } + + void backward(const UpdateCallback& callback) override { + if (biases_ && biases_->getWGrad()) { + backwardActivation(); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + biases_->getParameterPtr()->incUpdate(callback); + } + } +}; + +void RecurrentGradientMachine::init( + const ModelConfig& config, + ParamInitCallback callback, + const std::vector& parameterTypes, + bool useGpu) { + NeuralNetwork::init(config, callback, parameterTypes, useGpu); + useGpu_ = useGpu; + + auto subModelConfig = + std::find_if(config.sub_models().begin(), + config.sub_models().end(), + [this](const SubModelConfig& sub_model) { + return sub_model.name() == this->subModelName_; + }); + CHECK(subModelConfig != config.sub_models().end()); + reversed_ = subModelConfig->reversed(); + generating_ = subModelConfig->has_generator(); + + inFrameLines_.resize(subModelConfig->in_links_size()); + for (size_t i = 0; i < inFrameLines_.size(); ++i) { + inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name(); + inFrameLines_[i].inLayer = + rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name()); + } + + outFrameLines_.resize(subModelConfig->out_links_size()); + for (size_t i = 0; i < outFrameLines_.size(); ++i) { + auto& linkPair = subModelConfig->out_links(i); + outFrameLines_[i].layerName = linkPair.layer_name(); + outFrameLines_[i].agentLayer = rootNetwork_->getLayer(linkPair.link_name()); + } + + memoryFrameLines_.resize(subModelConfig->memories_size()); + for (size_t i = 0; i < memoryFrameLines_.size(); ++i) { + auto& memoryConfig = subModelConfig->memories(i); + memoryFrameLines_[i].layerName = memoryConfig.layer_name(); + memoryFrameLines_[i].linkName = memoryConfig.link_name(); + auto agentConfig = + std::find_if(config.layers().begin(), + config.layers().end(), + [&memoryConfig](const LayerConfig& layerConfig) { + return layerConfig.name() == memoryConfig.link_name(); + }); + CHECK(agentConfig != config.layers().end()); + if (memoryConfig.has_boot_layer_name()) { + memoryFrameLines_[i].rootLayer = + rootNetwork_->getLayer(memoryConfig.boot_layer_name()); + + LayerConfig scatterConfig = *agentConfig; + memoryFrameLines_[i].rootAgent.reset( + new ScatterAgentLayer(scatterConfig)); + memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_); + + memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent; + } else { + LayerConfig biasConfig = *agentConfig; + if (memoryConfig.has_boot_bias_parameter_name()) { + biasConfig.set_bias_parameter_name( + memoryConfig.boot_bias_parameter_name()); + biasConfig.set_active_type(memoryConfig.boot_bias_active_type()); + } else if (memoryConfig.has_boot_with_const_id()) { + 
biasConfig.set_bos_id(memoryConfig.boot_with_const_id()); + } + memoryFrameLines_[i].biasLayer.reset(new BootBiasLayer(biasConfig)); + memoryFrameLines_[i].biasLayer->init(LayerMap(), parameterMap_); + + memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].biasLayer; + } + + if (subModelConfig->has_generator()) { + memoryFrameLines_[i].scatterAgents.resize(2); + for (auto& agent : memoryFrameLines_[i].scatterAgents) { + agent.reset(new ScatterAgentLayer(*agentConfig)); + agent->init(LayerMap(), parameterMap_); + } + } + } + + if (subModelConfig->has_generator()) { + generator_.config = subModelConfig->generator(); + eosFrameLine_.reset(new EosFrameLine); + maxSequenceLength_ = generator_.config.max_num_frames(); + } + + // get parameters actually used by this Layer Group + resizeOrCreateFrames(1); + for (auto& para : frames_[0]->getParameters()) { + if (para->getSharedCount() > 0) { + parameterIds_.push_back(para->getID()); + } + } + for (auto& para : parameters_) { // bias layer parameters + if (para->getSharedCount() > 0) { + parameterIds_.push_back(para->getID()); + } + } +} + +void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { + if ((size_t)numFrames <= frames_.size()) { + return; + } + + frames_.reserve(numFrames); + for (auto& inFrameLine : inFrameLines_) { + inFrameLine.agents.reserve(numFrames); + } + for (auto& outFrameLine : outFrameLines_) { + outFrameLine.frames.reserve(numFrames); + } + for (auto& memoryFrameLine : memoryFrameLines_) { + memoryFrameLine.frames.reserve(numFrames); + memoryFrameLine.agents.reserve(numFrames); + } + if (eosFrameLine_) { + eosFrameLine_->layers.reserve(numFrames); + } + + ParamInitCallback subParamInitCb = [this](int paramId, Parameter* para) { + para->enableSharedType(PARAMETER_VALUE, + this->parameters_[paramId]->getBuf(PARAMETER_VALUE), + this->parameters_[paramId]->getMat(PARAMETER_VALUE)); + para->enableSharedType( + PARAMETER_GRADIENT, + this->parameters_[paramId]->getBuf(PARAMETER_GRADIENT), + this->parameters_[paramId]->getMat(PARAMETER_GRADIENT)); + }; + + for (int i = frames_.size(); i < numFrames; ++i) { + std::unique_ptr frame( + NeuralNetwork::newNeuralNetwork(subModelName_)); + frame->init(config_, subParamInitCb); + + for (auto& inFrameLine : inFrameLines_) { + inFrameLine.agents.push_back(frame->getLayer(inFrameLine.linkName)); + } + + for (auto& outFrameLine : outFrameLines_) { + outFrameLine.frames.push_back(frame->getLayer(outFrameLine.layerName)); + } + for (auto& memoryFrameLine : memoryFrameLines_) { + memoryFrameLine.frames.push_back( + frame->getLayer(memoryFrameLine.layerName)); + memoryFrameLine.agents.push_back( + frame->getLayer(memoryFrameLine.linkName)); + } + if (eosFrameLine_) { + eosFrameLine_->layers.push_back( + frame->getLayer(generator_.config.eos_layer_name())); + } + + frames_.emplace_back(std::move(frame)); + } +} + +void RecurrentGradientMachine::resizeBootFrame(int numSequences) { + for (auto& memoryFrameLine : memoryFrameLines_) { + if (memoryFrameLine.biasLayer) { + auto biasLayer = + dynamic_cast(memoryFrameLine.biasLayer.get()); + CHECK_NOTNULL(biasLayer); + biasLayer->resetHeight(numSequences); + } else { // check input root layer height + CHECK_EQ(numSequences, + memoryFrameLine.rootLayer->getOutput().getNumSequences()); + } + } +} + +void RecurrentGradientMachine::prefetch(const std::vector& inArgs) { + LOG(FATAL) << "should not use this function"; +} + +void RecurrentGradientMachine::checkInputConsistency( + int inlinkId, const std::vector& seqInfo) { + if 
(commonSeqInfo_.empty()) { + commonSeqInfo_.resize(seqInfo.size()); + for (size_t i = 0; i < seqInfo.size(); ++i) { + commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength; + commonSeqInfo_[i].seqId = seqInfo[i].seqId; + } + } else { + CHECK_EQ(commonSeqInfo_.size(), seqInfo.size()) + << " RecurrentGroup " << subModelName_ << " input " << inlinkId + << " has mismatched number of sequences"; + for (size_t i = 0; i < seqInfo.size(); ++i) { + CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength) + << " RecurrentGroup " << subModelName_ << " input " << inlinkId + << " has mismatched sequence length"; + CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId) + << " RecurrentGroup " << subModelName_ << " input " << inlinkId + << " has mismatched sequence length"; + } + } +} + +void RecurrentGradientMachine::calcNumSequencesAtEachStep() { + int numSequences = commonSeqInfo_.size(); + numSeqs_.resize(maxSequenceLength_); + for (int i = 0; i < numSequences; ++i) { + for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) { + numSeqs_[j] = i + 1; + } + } +} + +void RecurrentGradientMachine::reorganizeInput(PassType passType) { + info_.clear(); + info_.resize(inFrameLines_.size()); + + commonSeqInfo_.clear(); + seqInfos_.clear(); + seqInfos_.resize(inFrameLines_.size()); + + for (size_t i = 0; i < inFrameLines_.size(); i++) { + const Argument& input = inFrameLines_[i].inLayer->getOutput(); + if (!input.hasSeq()) { + continue; + } + input.getSeqInfo(&seqInfos_[i]); + checkInputConsistency(i, seqInfos_[i]); + } + CHECK(!commonSeqInfo_.empty()) + << "At least one input needs to be sequence or subsequence"; + maxSequenceLength_ = commonSeqInfo_[0].topLevelLength; + + calcNumSequencesAtEachStep(); + + for (size_t i = 0; i < inFrameLines_.size(); ++i) { + const Argument& input = inFrameLines_[i].inLayer->getOutput(); + if (!input.hasSeq()) { + seqInfos_[i] = commonSeqInfo_; + } + createInFrameInfo(i, input, passType); + } + + { + AsyncGpuBlock asyncGpuBlock; + + // inFrameLine select rows in real layer one time + for (size_t i = 0; i < inFrameLines_.size(); i++) { + selectRowsOneTime(inFrameLines_[i].inLayer, + info_[i].allIds, + &(inFrameLines_[i].outArg), + passType); + } + } +} + +void RecurrentGradientMachine::reorganizeOutput(PassType passType) { + calcSequenceStartPositions(); + for (size_t i = 0; i < outFrameLines_.size(); ++i) { + Info info; + auto& outFrameLine = outFrameLines_[i]; + ICpuGpuVectorPtr sequenceStartPositions; + ICpuGpuVectorPtr subSequenceStartPositions; + createOutFrameInfo( + outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); + auto gatherAgent = + dynamic_cast(outFrameLine.agentLayer.get()); + CHECK_NOTNULL(gatherAgent); + gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions, + subSequenceStartPositions, + info.allIds, + info.idIndex); + } +} + +void RecurrentGradientMachine::connectFrames(PassType passType) { + for (auto& memoryFrameLine : memoryFrameLines_) { + if (memoryFrameLine.rootAgent) { + auto scatterAgent = + dynamic_cast(memoryFrameLine.rootAgent.get()); + createMemoryFrameInfo(&memoryFrameLine, passType); + scatterAgent->setRealLayerAndOutput(memoryFrameLine.rootLayer, + memoryFrameLine.outArg, + memoryFrameLine.allIds, + /* idIndex */ 0, + memoryFrameLine.allIds->getSize(), + /* handleBackward */ true); + if (memoryFrameLine.sequenceStartPositions) { + int size = memoryFrameLine.sequenceStartPositions->getSize(); + scatterAgent->setSequenceStartPositions( + memoryFrameLine.sequenceStartPositions, + /* seqStartPosIndex 
*/ 0, + size); + } + } + } + + for (auto& outFrameLine : outFrameLines_) { + auto gatherAgent = + dynamic_cast(outFrameLine.agentLayer.get()); + gatherAgent->clearRealLayers(); + } + for (int i = 0; i < maxSequenceLength_; ++i) { + // connect in_links + for (size_t j = 0; j < inFrameLines_.size(); ++j) { + Info& info = info_[j]; + // idSize denotes the sum number of tokens in each length i + int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i]; + int idSize = info.idIndex.empty() ? numSeqs_[i] + : info.idIndex[i + 1] - info.idIndex[i]; + InFrameLine inFrameLine = inFrameLines_[j]; + auto scatterAgent = + dynamic_cast(inFrameLine.agents[i].get()); + scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer, + inFrameLine.outArg, + info.allIds, + idIndex, + idSize, + i == 0); + if (info.sequenceStartPositions) { + // size: the length of subsequence + int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i]; + scatterAgent->setSequenceStartPositions( + info.sequenceStartPositions, info.seqStartPosIndex[i], size); + } + } + + // connect out_links + for (auto& outFrameLine : outFrameLines_) { + auto gatherAgent = + dynamic_cast(outFrameLine.agentLayer.get()); + gatherAgent->addRealLayer(outFrameLine.frames[i]); + } + for (auto& memoryFrameLine : memoryFrameLines_) { + NeuralNetwork::connect( + memoryFrameLine.agents[i], + i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1], + numSeqs_[i] /*height of agent*/); + } + } +} + +void RecurrentGradientMachine::forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType) { + /* inArgs and outArgs are not used. + The inputs are inFrameLines_[i].inLayer. + The outputs are outFramesLines_[i].agentLayer + */ + + if (generating_) { + generateSequence(); + return; + } // else forward.. 
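+
+  // Training/testing pass, in outline: reorganizeInput() batches step i of
+  // every input sequence together; resizeOrCreateFrames() makes one
+  // sub-network ("frame") per time step; connectFrames() wires the memory
+  // links between consecutive frames; then each frame runs forward in
+  // time-step order and reorganizeOutput() gathers the per-step outputs
+  // back into whole sequences.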
+
+  reorganizeInput(passType);
+  int numSequences = commonSeqInfo_.size();
+
+  resizeOrCreateFrames(maxSequenceLength_);
+  resizeBootFrame(numSequences);
+
+  connectFrames(passType);
+
+  REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime");
+  // forward
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    memoryFrameLine.bootLayer->forward(passType);
+  }
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    const std::vector<Argument> inArgs;
+    std::vector<Argument> outArgs;
+    frames_[i]->forward(inArgs, &outArgs, passType);
+  }
+
+  reorganizeOutput(passType);
+}
+
+void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
+  if (generating_) {
+    return;
+  }
+  REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime");
+  AsyncGpuBlock asyncGpuBlock;
+  for (int i = maxSequenceLength_ - 1; i >= 0; --i) {
+    frames_[i]->backward(nullptr);
+  }
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    memoryFrameLine.bootLayer->backward(nullptr);
+  }
+}
+
+void RecurrentGradientMachine::forwardBackward(
+    const std::vector<Argument>& inArgs,
+    std::vector<Argument>* outArgs,
+    PassType passType,
+    const UpdateCallback& callback) {
+  LOG(FATAL) << "should not use this function";
+}
+
+void RecurrentGradientMachine::eval(Evaluator* evaluator) const {
+  // call printers frame by frame
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin";
+    evaluator->eval(*(frames_[i].get()));
+    VLOG(2) << "Recurrent Layer Group eval frame " << i << " end";
+  }
+}
+
+void RecurrentGradientMachine::registerBeamSearchControlCallbacks(
+    const BeamSearchCandidatesAdjustCallback& adjustBeamSearch,
+    const NormOrDropNodeCallback& normOrDropNode,
+    const DropCallback& stopBeamSearch) {
+  this->removeBeamSearchControlCallbacks();
+  //! for gcc 4.6, aggregate initialization is not supported.
TAT + this->beamSearchCtrlCallbacks_ = new BeamSearchControlCallbacks( + adjustBeamSearch, normOrDropNode, stopBeamSearch); +} + +void RecurrentGradientMachine::removeBeamSearchControlCallbacks() { + if (this->beamSearchCtrlCallbacks_) { + delete this->beamSearchCtrlCallbacks_; + this->beamSearchCtrlCallbacks_ = nullptr; + } +} + +void RecurrentGradientMachine::registerBeamSearchStatisticsCallbacks( + const EachStepCallback& onEachStepStarted, + const EachStepCallback& onEachStepStoped) { + this->removeBeamSearchStatisticsCallbacks(); + this->beamSearchStatistics_ = + new BeamSearchStatisticsCallbacks(onEachStepStarted, onEachStepStoped); +} + +void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() { + if (this->beamSearchStatistics_) { + delete this->beamSearchStatistics_; + this->beamSearchStatistics_ = nullptr; + } +} + +namespace { +void lenToStarts(std::vector& starts) { + int pos = 0; + starts.back() = 0; + for (auto& start : starts) { + int tmp = start; + start = pos; + pos += tmp; + } + starts.back() = pos; +} +} // namespace + +void RecurrentGradientMachine::calcSequenceStartPositions() { + std::vector starts(commonSeqInfo_.size() + 1); + for (auto& seqInfo : commonSeqInfo_) { + starts[seqInfo.seqId] = seqInfo.topLevelLength; + } + lenToStarts(starts); + ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false); + std::copy(starts.begin(), + starts.end(), + sequenceStartPositions_->getMutableData(false)); +} + +void RecurrentGradientMachine::checkOutputConsistency( + OutFrameLine& outFrameLine) { + bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq(); + for (int i = 0; i < maxSequenceLength_; ++i) { + LayerPtr frame = outFrameLine.frames[i]; + CHECK_EQ(hasSeq, frame->getOutput().hasSeq()); + int numSequences = frame->getOutput().getNumSequences(); + CHECK_EQ(numSeqs_[i], numSequences); + } +} + +void RecurrentGradientMachine::createOutFrameInfo( + OutFrameLine& outFrameLine, + Info& info, + ICpuGpuVectorPtr& sequenceStartPositions, + ICpuGpuVectorPtr& subSequenceStartPositions) { + checkOutputConsistency(outFrameLine); + + if (!outFrameLine.frames[0]->getOutput().hasSeq()) { + createOutFrameInfo_seq( + outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); + } else { + createOutFrameInfo_subseq( + outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); + } +} + +void RecurrentGradientMachine::createOutFrameInfo_seq( + OutFrameLine& outFrameLine, + Info& info, + ICpuGpuVectorPtr& sequenceStartPositions, + ICpuGpuVectorPtr& subSequenceStartPositions) { + std::vector allIds; + info.idIndex.resize(1, 0); // first idIndex = 0 + + const int* starts = sequenceStartPositions_->getData(false); + + for (int i = 0; i < maxSequenceLength_; ++i) { + LayerPtr frame = outFrameLine.frames[i]; + size_t numSequences = frame->getOutput().getNumSequences(); + for (size_t j = 0; j < numSequences; ++j) { + int seqStart = starts[commonSeqInfo_[j].seqId]; + int seqLength = commonSeqInfo_[j].topLevelLength; + allIds.push_back(reversed_ ? 
(seqStart + seqLength - 1 - i) + : (seqStart + i)); + } + info.idIndex.push_back(allIds.size()); + } + sequenceStartPositions = sequenceStartPositions_; + copyScattedId(allIds, &info.allIds, allIds.size()); + CHECK_EQ(info.idIndex.size(), static_cast(maxSequenceLength_ + 1)); +} + +void RecurrentGradientMachine::createOutFrameInfo_subseq( + OutFrameLine& outFrameLine, + Info& info, + ICpuGpuVectorPtr& sequenceStartPositions, + ICpuGpuVectorPtr& subSequenceStartPositions) { + size_t numSequences = commonSeqInfo_.size(); + std::vector allIds; + info.idIndex.resize(1, 0); // first idIndex = 0 + + const int* starts = sequenceStartPositions_->getData(false); + std::vector subStarts(starts[numSequences] + 1); + for (int i = 0; i < maxSequenceLength_; ++i) { + LayerPtr frame = outFrameLine.frames[i]; + size_t numSequences = frame->getOutput().getNumSequences(); + const int* seqStarts = + frame->getOutput().sequenceStartPositions->getData(false); + for (size_t j = 0; j < numSequences; ++j) { + subStarts[starts[commonSeqInfo_[j].seqId] + i] = + seqStarts[j + 1] - seqStarts[j]; + } + } + lenToStarts(subStarts); + + for (int i = 0; i < maxSequenceLength_; ++i) { + LayerPtr frame = outFrameLine.frames[i]; + size_t numSequences = frame->getOutput().getNumSequences(); + for (size_t j = 0; j < numSequences; ++j) { + int pos = starts[commonSeqInfo_[j].seqId] + i; + int subSeqStart = subStarts[pos]; + int subSeqEnd = subStarts[pos + 1]; + for (int k = subSeqStart; k < subSeqEnd; ++k) { + allIds.push_back(k); + } + } + info.idIndex.push_back(allIds.size()); + } + + ICpuGpuVector::resizeOrCreate( + subSequenceStartPositions, subStarts.size(), false); + int* cpuSubSequenceStartPositions = + subSequenceStartPositions->getMutableData(false); + std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, numSequences + 1, false); + int* cpuSequenceStartPositions = + sequenceStartPositions->getMutableData(false); + for (size_t i = 0; i <= numSequences; ++i) { + cpuSequenceStartPositions[i] = subStarts[starts[i]]; + } + copyScattedId(allIds, &info.allIds, allIds.size()); + CHECK_EQ(info.idIndex.size(), static_cast(maxSequenceLength_ + 1)); +} + +/* create scattered id infomation for all realLayer of inFrameLines one time. + * If hasSubseq, will also create scattered sequenceStartPositions infomation + * for all realLayer of inFrameLines one time. 
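 *
 * For example (two sequences with batch rows [a0 a1 a2 b0 b1], of lengths 3
 * and 2), the scattered ids become step 0: {a0, b0}, step 1: {a1, b1},
 * step 2: {a2}, so each frame can select its input rows in a single pass;
 * when reversed_ is set, the offsets run from the sequence end instead.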
+ */ +void RecurrentGradientMachine::createInFrameInfo(int inlinkId, + const Argument& input, + PassType passType) { + if (!input.hasSeq()) { + createInFrameInfo_nonseq(inlinkId, input, passType); + } else if (!input.hasSubseq()) { + createInFrameInfo_seq(inlinkId, input, passType); + } else { + createInFrameInfo_subseq(inlinkId, input, passType); + } +} + +void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId, + const Argument& input, + PassType passType) { + std::vector allIds; + + auto& seqInfo = seqInfos_[inlinkId]; + Info* inlinkInfo = &info_[inlinkId]; + inlinkInfo->idIndex.clear(); + for (size_t i = 0; i < seqInfo.size(); ++i) { + allIds.push_back(seqInfo[i].seqId); + } + // copy and check scatterId + copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); +} + +void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId, + const Argument& input, + PassType passType) { + std::vector allIds; + auto& seqInfo = seqInfos_[inlinkId]; + Info* inlinkInfo = &info_[inlinkId]; + inlinkInfo->idIndex.resize(1, 0); // first idIndex = 0 + + for (int i = 0; i < maxSequenceLength_; ++i) { + for (int j = 0; j < numSeqs_[i]; ++j) { + int seqLength = seqInfo[j].topLevelLength; + int seqStart = seqInfo[j].seqStart; + allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i) + : (seqStart + i)); + } + inlinkInfo->idIndex.push_back(allIds.size()); + } + + // copy and check scatterId + copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); + CHECK_EQ(inlinkInfo->idIndex.size(), + static_cast(maxSequenceLength_ + 1)); +} +void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId, + const Argument& input, + PassType passType) { + std::vector allIds; + + auto& seqInfo = seqInfos_[inlinkId]; + + Info* inlinkInfo = &info_[inlinkId]; + inlinkInfo->idIndex.resize(1, 0); // first idIndex = 0 + std::vector sequenceStartPositions; + const int* subSequenceStartPositions = nullptr; + + subSequenceStartPositions = input.subSequenceStartPositions->getData(false); + inlinkInfo->seqStartPosIndex.clear(); + inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 + for (int i = 0; i < maxSequenceLength_; ++i) { + sequenceStartPositions.push_back(0); // first element = 0 + for (int j = 0; j < numSeqs_[i]; ++j) { + int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i]; + int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1]; + for (int k = subSeqStart; k < subSeqEnd; ++k) { + allIds.push_back(k); + } + sequenceStartPositions.push_back(sequenceStartPositions.back() + + subSeqEnd - subSeqStart); + } + inlinkInfo->idIndex.push_back(allIds.size()); + inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size()); + } + // inFrameLine create sequenceStartPositions one time + CHECK_EQ( + sequenceStartPositions.size(), + static_cast(maxSequenceLength_ + input.getNumSubSequences())); + CHECK_EQ(inlinkInfo->seqStartPosIndex.size(), + static_cast(maxSequenceLength_ + 1)); + createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions); + + // copy and check scatterId + copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); + CHECK_EQ(inlinkInfo->idIndex.size(), + static_cast(maxSequenceLength_ + 1)); +} + +/* like createInFrameInfo, but for all realLayer of memoryFrameLines*/ +void RecurrentGradientMachine::createMemoryFrameInfo( + MemoryFrameLine* memoryFrameLine, PassType passType) { + const Argument& input = (*memoryFrameLine).rootLayer->getOutput(); + size_t numSequences = 
input.getNumSequences(); + std::vector allIds; + bool seqFlag = input.hasSeq(); + CHECK(!input.hasSubseq()) + << "Subsequence boot layer for memory is not supported"; + + if (seqFlag) { // for sequenceScatterAgentLayer + std::vector sequenceStartPositions; + sequenceStartPositions.push_back(0); // first element = 0 + const int* starts = input.sequenceStartPositions->getData(false); + for (size_t i = 0; i < numSequences; ++i) { + // memory info adopt info of inlinks[0] + int seqId = seqInfos_[0][i].seqId; + for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) { + allIds.push_back(k); + } + sequenceStartPositions.push_back(sequenceStartPositions.back() + + starts[seqId + 1] - starts[seqId]); + } + createSeqPos(sequenceStartPositions, + &(*memoryFrameLine).sequenceStartPositions); + + } else { // for scatterAgentLayer + for (size_t i = 0; i < numSequences; ++i) { + allIds.push_back(seqInfos_[0][i].seqId); + } + } + // copy and check scatterId + copyScattedId(allIds, &(*memoryFrameLine).allIds, input.getBatchSize()); + // memoryFrameLine select rows in real layer one time + selectRowsOneTime((*memoryFrameLine).rootLayer, + (*memoryFrameLine).allIds, + &(*memoryFrameLine).outArg, + passType); +} + +void RecurrentGradientMachine::copyScattedId(std::vector& srcIds, + IVectorPtr* dstIds, + int size) { + int idSize = srcIds.size(); + CHECK_EQ(idSize, size); + IVector::resizeOrCreate(*dstIds, idSize, useGpu_); + (*dstIds)->copyFrom(srcIds.data(), idSize); + // check + std::sort(srcIds.begin(), srcIds.end()); + for (int i = 0; i < idSize; ++i) { + CHECK_EQ(srcIds[i], i); + } +} + +void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer, + const IVectorPtr& allIds, + Argument* arg, + PassType passType) { + Argument& src = layer->getOutput(); + if (src.value) { + const MatrixPtr& realV = src.value; + int height = realV->getHeight(); + int width = realV->getWidth(); + Matrix::resizeOrCreate( + arg->value, height, width, /* trans */ false, useGpu_); + arg->value->zeroMem(); + arg->value->selectRows(*realV, *allIds); + if (passType != PASS_TEST) { + Matrix::resizeOrCreate( + arg->grad, height, width, /* trans */ false, useGpu_); + arg->grad->zeroMem(); + } + } + if (src.ids) { + IVector::resizeOrCreate(arg->ids, src.ids->getSize(), useGpu_); + arg->ids->selectFrom(*src.ids, *allIds); + } +} + +void RecurrentGradientMachine::createSeqPos( + const std::vector& sequenceStartPosition, + ICpuGpuVectorPtr* sequenceStartPositions) { + int size = sequenceStartPosition.size(); + const int* data = sequenceStartPosition.data(); + ICpuGpuVector::resizeOrCreate(*sequenceStartPositions, size, false); + (*sequenceStartPositions)->copyFrom(data, size, false); +} + +size_t RecurrentGradientMachine::getGenBatchSize() { + size_t numSequences = 0; + for (auto& memoryFrameLine : memoryFrameLines_) { + if (!memoryFrameLine.rootLayer) continue; + Argument& bootArg = memoryFrameLine.rootLayer->getOutput(); + size_t batchSize = bootArg.getNumSequences(); + if (numSequences) { + CHECK_EQ(numSequences, batchSize); + } else { + numSequences = batchSize; + } + } + CHECK(numSequences) + << "Fail to get batch size in generation. 
" + "At least one of the Memory layer MUST have a layer that is NOT in " + "the layer group to boot it, and this boot layer is used to " + "decide batch_size in generation process."; + return numSequences; +} + +void RecurrentGradientMachine::generateSequence() { + CHECK_NOTNULL(eosFrameLine_.get()); + CHECK_GE(outFrameLines_.size(), 1UL); + size_t numSequences = getGenBatchSize(); + + resizeBootFrame(numSequences); + // We create only two sub-network in generation, one stores states of all + // layers in previous time step and the other storing the states at current + // time step. + resizeOrCreateFrames(2); + + // outFrameLines_.size() > 1UL + dataArgsSize_ = outFrameLines_.size() - 1; + dataArgs_.resize(dataArgsSize_); + dataArgsFrame_.clear(); + dataArgsFrame_.resize(dataArgsSize_); + + // connect boot frame memory links + std::vector ids(numSequences); + for (size_t i = 0; i < numSequences; ++i) { + ids[i] = i; + } + for (auto& memoryFrameLine : memoryFrameLines_) { + if (memoryFrameLine.rootAgent) { + auto scatterAgent = + dynamic_cast(memoryFrameLine.rootAgent.get()); + scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids); + } + NeuralNetwork::connect( + memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size()); + } + + // boot layer forward + AsyncGpuBlock asyncGpuBlock; + + for (auto& memoryFrameLine : memoryFrameLines_) { + memoryFrameLine.bootLayer->forward(PASS_TEST); + } + + // init outArg + size_t resultNum = generator_.config.num_results_per_sample(); + size_t maxGenWordCount = + generator_.config.max_num_frames() * numSequences * resultNum; + IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false); + if (resultNum > 1) { + CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); + Matrix::resizeOrCreate(generator_.outArg.in, + /* height */ numSequences, + /* width */ resultNum, + false, + /* useGpu */ false); + } + ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, + numSequences + 1, + /* useGpu */ false); + if (getBeamSize() > 1) { + beamSearch(numSequences); + } else { + oneWaySearch(numSequences); + } + if (dataArgsSize_) createDataOutlink(); + + size_t size = generator_.ids.size(); + generator_.outArg.ids->resize(size); + generator_.outArg.ids->copyFrom(generator_.ids.data(), size); + + OutFrameLine& outFrameLine = outFrameLines_[0]; + auto dataAgent = dynamic_cast(outFrameLine.agentLayer.get()); + CHECK_NOTNULL(dataAgent); + dataAgent->setData(generator_.outArg); + dataAgent->prefetch(); +} + +void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { + OutFrameLine& outFrameLine = outFrameLines_[0]; + + // finalPaths_[0] stores the generated results of the + // entire batch, so its size exactly equals to batchSize. 
+ finalPaths_.clear(); + finalPaths_.resize(1); + std::vector& finalPaths = finalPaths_[0]; + finalPaths.resize(batchSize); + + seqIds_.resize(batchSize); + std::vector scatterIds; + for (size_t i = 0; i < batchSize; ++i) { + finalPaths[i].seqId = i; + seqIds_[i] = i; + } + + // forward + for (int i = 0; i < maxSequenceLength_; ++i) { + if (i && scatterIds.empty()) break; + int machineCur = i % 2; + int machinePrev = (i - 1) % 2; + // connect memory links + if (i) { + seqIds_.clear(); + for (size_t j = 0; j < batchSize; ++j) { + if (finalPaths[j].seqId != -1) seqIds_.push_back(j); + } + + for (auto& memoryFrameLine : memoryFrameLines_) { + auto scatterAgent = dynamic_cast( + memoryFrameLine.scatterAgents[machineCur].get()); + scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev], + scatterIds); + scatterAgent->forward(PASS_TEST); + NeuralNetwork::connect(memoryFrameLine.agents[machineCur], + memoryFrameLine.scatterAgents[machineCur]); + } + } + const std::vector inArgs; + std::vector outArgs; + frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST); + + const IVectorPtr& idVec = outFrameLine.frames[machineCur]->getOutput().ids; + for (size_t j = 0; j < seqIds_.size(); ++j) { + finalPaths[seqIds_[j]].ids.push_back(idVec->getElement(j)); + finalPaths[seqIds_[j]].machineIdVec.push_back(j); + } + + copyDataOutlinkFrame(machineCur); + + // check eos + const IVectorPtr& eosVec = + eosFrameLine_->layers[machineCur]->getOutput().ids; + scatterIds.clear(); + for (size_t j = 0; j < seqIds_.size(); ++j) { + if (eosVec->getElement(j) == 1U) { + // path.seqId = -1 indicates end of generation + // of an input sequence + finalPaths[seqIds_[j]].seqId = -1; + } else { + scatterIds.push_back(j); + } + } + } + + batchMachineIdVec_.clear(); + batchMachineStartPos_.clear(); + int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); + starts[0] = 0; + generator_.ids.clear(); + for (size_t i = 0; i < batchSize; ++i) { + generator_.ids.insert(generator_.ids.end(), + finalPaths[i].ids.begin(), + finalPaths[i].ids.end()); + starts[i + 1] = generator_.ids.size(); + batchMachineIdVec_.insert(batchMachineIdVec_.end(), + finalPaths[i].machineIdVec.begin(), + finalPaths[i].machineIdVec.end()); + } +} + +void RecurrentGradientMachine::connectPrevFrame(int stepId, + std::vector& paths) { + int machineCur = stepId % 2; + int machinePrev = (stepId - 1) % 2; + int beam = getBeamSize(); + machineIds_.clear(); + topIds_.clear(); + seqIds_.clear(); + + for (size_t j = 0; j < paths.size(); ++j) { + machineIds_.push_back(paths[j].machineId); + topIds_.push_back(paths[j].machineId * beam + paths[j].topIndex); + seqIds_.push_back(paths[j].seqId); + } + + for (auto& memoryFrameLine : memoryFrameLines_) { + bool isOutIds = (memoryFrameLine.layerName == outFrameLines_[0].layerName); + auto scatterAgent = dynamic_cast( + memoryFrameLine.scatterAgents[machineCur].get()); + scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev], + isOutIds ? 
topIds_ : machineIds_); + scatterAgent->forward(PASS_TEST); + NeuralNetwork::connect(memoryFrameLine.agents[machineCur], + memoryFrameLine.scatterAgents[machineCur]); + } +} + +void RecurrentGradientMachine::forwardFrame(int machineCur) { + // forward + const std::vector inArgs; + std::vector outArgs; + frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST); + + copyDataOutlinkFrame(machineCur); + + IVectorPtr& ids = outFrameLines_[0].frames[machineCur]->getOutput().ids; + MatrixPtr in = outFrameLines_[0].frames[machineCur]->getOutput().in; + IVectorPtr& eos = eosFrameLine_->layers[machineCur]->getOutput().ids; + if (useGpu_) { + IVector::resizeOrCreate(cpuId_, ids->getSize(), false /* useGpu */); + cpuId_->copyFrom(*ids); + Matrix::resizeOrCreate(cpuProb_, + in->getHeight(), + in->getWidth(), + false /* trans */, + false /* useGpu */); + cpuProb_->copyFrom(*in); + IVector::resizeOrCreate(cpuEos_, eos->getSize(), false /* useGpu */); + cpuEos_->copyFrom(*eos); + } else { + cpuId_ = ids; + cpuProb_ = in; + cpuEos_ = eos; + } +} + +void RecurrentGradientMachine::singlePathExpand(Path& curPath, + size_t curPathId, + std::vector& newPaths, + size_t expandWidth) { + int calc_id = + gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0; + + const int* idVec = cpuId_->getData(); + const real* probMat = cpuProb_->getData(); + const int* eosVec = cpuEos_->getData(); + + for (size_t k = 0; k < expandWidth; k++) { + int index = curPathId * expandWidth + k; + int id = idVec[index]; + real prob = probMat[index]; + /* + * Ordinarily, beam search greedily expands the most promising expandWidth + * paths that currently are ALWAYS returned by MaxIdLayer. + * In one condition, if user customizes the beam search procedure by + * restricting the expansion within a user defined subset, + * as a result, MaxIdLayer possibly COULD NOT return expandWidth + * vaild expansions, and it will use -1 to indicate the end of valid + * expansion candidates. + */ + if (id == -1) break; + + real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob; + Path newPath( + curPath, id, newLogProb, curPathId /*machineId*/, k /*topIndex*/); + if (this->beamSearchCtrlCallbacks_) { + if (beamSearchCtrlCallbacks_->stopDetermineCandidates( + newPath.seqId, newPath.ids, newPath.probHistory)) + return; + } + // outFrameLines_.size() > 1UL + if (dataArgsSize_) { + newPath.machineIdVec = curPath.machineIdVec; + newPath.machineIdVec.push_back(curPathId); + } + bool atEos = + eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_; + // adjustNewPath + newPath.adjustProb(calc_id, atEos); + if (this->beamSearchCtrlCallbacks_) { + this->beamSearchCtrlCallbacks_->normOrDropNode( + newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb); + } + if (!newPath.isDropable()) { + atEos ? finalPaths_[curPath.seqId].push_back(newPath) + : newPaths.push_back(newPath); + } + } // for expandWidth + + if (gDiyProbStop) { + gDiyProbStop(calc_id); + } +} + +void RecurrentGradientMachine::beamExpand(std::vector& paths, + std::vector& newPaths) { + size_t candidatePathCount = paths.size(); + // idVec.size() could be larger than candidatePathCount * beam, + // so user can drop some node customly. 
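+  // e.g. with beam size 3 and 2 candidate paths, MaxIdLayer ordinarily
+  // returns 3 ids per path, so cpuId_ holds 6 ids and expandWidth below is
+  // 3; deriving expandWidth from the actual output size, instead of
+  // assuming it equals the beam size, keeps custom pruning callbacks
+  // working.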
+ CHECK_EQ(cpuId_->getSize() % candidatePathCount, 0UL); + size_t expandWidth = cpuId_->getSize() / candidatePathCount; + + // iterate over each sequence + size_t totalExpandCount = 0; + int prevSeqId = -1; + int curSeqId = 0; + for (size_t j = 0; j <= candidatePathCount; j++) { + // expansions of a single sequence are all processed + curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1); + if (prevSeqId != -1 && curSeqId != prevSeqId) { + totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount); + } + if (j == candidatePathCount) return; + singlePathExpand(paths[j], j, newPaths, expandWidth); + + prevSeqId = paths[j].seqId; + } // for paths +} + +// Drop extra nodes to beam size. +size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, + size_t seqId, + size_t totalExpandCount) { + size_t minNewPathSize = + std::min(getBeamSize(), newPaths.size() - totalExpandCount); + if (!minNewPathSize) { + return 0; + } + std::nth_element(newPaths.begin() + totalExpandCount, + newPaths.begin() + totalExpandCount + minNewPathSize, + newPaths.end(), + Path::greaterPath); + newPaths.resize(totalExpandCount + minNewPathSize); + + real minPathLogProb = + std::min_element(newPaths.end() - minNewPathSize, newPaths.end()) + ->logProb; + real maxPathLogProb = + std::max_element(newPaths.end() - minNewPathSize, newPaths.end()) + ->logProb; + + // Remove the already formed paths that are relatively short + finalPaths_[seqId].erase( + std::remove_if(finalPaths_[seqId].begin(), + finalPaths_[seqId].end(), + [&](Path& p) { return p.logProb < minPathLogProb; }), + finalPaths_[seqId].end()); + for (auto p : finalPaths_[seqId]) { + if (minFinalPathLogProb_[seqId] > p.logProb) { + minFinalPathLogProb_[seqId] = p.logProb; + } + } + + if (finalPaths_[seqId].size() >= getBeamSize() && + minFinalPathLogProb_[seqId] >= maxPathLogProb) { + newPaths.resize(totalExpandCount); + return 0; + } + return minNewPathSize; +} + +void RecurrentGradientMachine::fillGenOutputs() { + size_t numResults = generator_.config.num_results_per_sample(); + for (size_t i = 0; i < finalPaths_.size(); ++i) { + size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size()); + std::partial_sort(finalPaths_[i].begin(), + finalPaths_[i].begin() + minFinalPathsSize, + finalPaths_[i].end(), + Path::greaterPath); + finalPaths_[i].resize(minFinalPathsSize); + } + + generator_.ids.clear(); + int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); + starts[0] = 0; + if (numResults > 1) { + int idsProbSaveSize = 0; + for (auto inSeq : finalPaths_) { + for (auto path : inSeq) idsProbSaveSize += path.ids.size(); + idsProbSaveSize += inSeq.size(); + } + Matrix::resizeOrCreate( + generator_.outArg.value, idsProbSaveSize, 1, false, false); + real* idsProb = generator_.outArg.value->getData(); + + real* probs = generator_.outArg.in->getData(); + size_t curPos = 0; + for (size_t i = 0; i < finalPaths_.size(); ++i) { + for (size_t j = 0; j < finalPaths_[i].size(); ++j) { + Path& path = finalPaths_[i][j]; + size_t genLen = path.ids.size(); + generator_.ids.push_back(genLen); // sequence size + generator_.ids.insert( + generator_.ids.end(), path.ids.begin(), path.ids.end()); + generator_.ids.push_back(-1); // end of sequence + + memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen); + curPos += genLen; + idsProb[curPos++] = -1.0; + probs[i * numResults + j] = path.logProb; + } + starts[i + 1] = generator_.ids.size(); + } + } else { + for (size_t i = 0; i < finalPaths_.size(); ++i) { + 
+void RecurrentGradientMachine::fillGenOutputs() {
+  size_t numResults = generator_.config.num_results_per_sample();
+  for (size_t i = 0; i < finalPaths_.size(); ++i) {
+    size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size());
+    std::partial_sort(finalPaths_[i].begin(),
+                      finalPaths_[i].begin() + minFinalPathsSize,
+                      finalPaths_[i].end(),
+                      Path::greaterPath);
+    finalPaths_[i].resize(minFinalPathsSize);
+  }
+
+  generator_.ids.clear();
+  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
+  starts[0] = 0;
+  if (numResults > 1) {
+    int idsProbSaveSize = 0;
+    for (auto inSeq : finalPaths_) {
+      for (auto path : inSeq) idsProbSaveSize += path.ids.size();
+      idsProbSaveSize += inSeq.size();
+    }
+    Matrix::resizeOrCreate(
+        generator_.outArg.value, idsProbSaveSize, 1, false, false);
+    real* idsProb = generator_.outArg.value->getData();
+
+    real* probs = generator_.outArg.in->getData();
+    size_t curPos = 0;
+    for (size_t i = 0; i < finalPaths_.size(); ++i) {
+      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+        Path& path = finalPaths_[i][j];
+        size_t genLen = path.ids.size();
+        generator_.ids.push_back(genLen);  // sequence size
+        generator_.ids.insert(
+            generator_.ids.end(), path.ids.begin(), path.ids.end());
+        generator_.ids.push_back(-1);  // end of sequence
+
+        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
+        curPos += genLen;
+        idsProb[curPos++] = -1.0;
+        probs[i * numResults + j] = path.logProb;
+      }
+      starts[i + 1] = generator_.ids.size();
+    }
+  } else {
+    for (size_t i = 0; i < finalPaths_.size(); ++i) {
+      CHECK(!finalPaths_[i].empty());
+      Path& path = finalPaths_[i][0];
+      generator_.ids.insert(
+          generator_.ids.end(), path.ids.begin(), path.ids.end());
+      starts[i + 1] = starts[i] + path.ids.size();
+    }
+  }
+}
+
+void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
+  for (size_t i = 0; i < dataArgsSize_; i++) {
+    Argument outFrame;
+    outFrame.resizeAndCopyFrom(
+        outFrameLines_[i + 1].frames[machineCur]->getOutput(), useGpu_);
+    dataArgsFrame_[i].emplace_back(outFrame);
+  }
+}
+
+void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
+    bool isSeq, std::vector<Argument>& outArgs) {
+  batchMachineIdVec_.clear();
+
+  size_t seqIdx = 0;
+  for (size_t i = 0; i < finalPaths_.size(); ++i) {
+    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
+      if (isSeq) {
+        for (size_t i = 0; i < machineIdVec.size(); ++i) {
+          size_t rowId = machineIdVec[i];
+          int* seqPos =
+              outArgs[i].sequenceStartPositions->getMutableData(false);
+          batchMachineIdVec_.push_back(seqPos[rowId]);
+        }
+      } else {
+        batchMachineIdVec_.insert(
+            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
+      }
+      seqIdx++;
+    }
+  }
+}
+
+void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
+    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
+  size_t totalSeqNum = std::accumulate(
+      finalPaths_.begin(),
+      finalPaths_.end(),
+      0UL,
+      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
+  copySize.resize(totalSeqNum, 1);
+
+  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
+  if (isSeq) {
+    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
+    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
+             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
+    int* starts = inputSeqStartPos->getMutableData(false);
+    int seqId = 0;
+    for (size_t i = 0; i < finalPaths_.size(); ++i) {
+      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+        copySize[seqId] = getBeamSize() > 1 ?
starts[i + 1] - starts[i] + : starts[j + 1] - starts[j]; + batchMachineStartPos_[seqId + 1] = + batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size(); + seqId++; + } + } + } else { + for (size_t i = 0; i < finalPaths_[0].size(); ++i) + batchMachineStartPos_[i + 1] = + batchMachineStartPos_[i] + finalPaths_[0][i].ids.size(); + } +} + +void RecurrentGradientMachine::createDataOutlink() { + for (size_t i = 0; i < dataArgsSize_; i++) { + bool isSeq = dataArgsFrame_[i][0].hasSeq(); + std::vector copySize; + createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize); + createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]); + + dataArgs_[i].concat(dataArgsFrame_[i], + batchMachineIdVec_, + batchMachineStartPos_, + copySize, + useGpu_, + HPPL_STREAM_1, + PASS_TEST); + auto dataAgent = + dynamic_cast(outFrameLines_[i + 1].agentLayer.get()); + CHECK_NOTNULL(dataAgent); + dataAgent->setData(dataArgs_[i]); + } +} + +void RecurrentGradientMachine::beamSearch(size_t batchSize) { + finalPaths_.clear(); + finalPaths_.resize(batchSize); + seqIds_.resize(batchSize); + minFinalPathLogProb_.clear(); + minFinalPathLogProb_.resize(batchSize, 0); + + std::vector paths; + std::vector newPaths; + for (size_t i = 0; i < batchSize; ++i) { + paths.push_back(Path(i)); + if (this->beamSearchCtrlCallbacks_) { + paths.back().recordHistory(); + } + } + + // restart beam search + stopBeamSearch_ = false; + for (int i = 0; i < maxSequenceLength_; ++i) { + int machineCur = i % 2; + std::unique_ptr< + ScopedCallbacks> + statisticsBlock; + if (this->beamSearchStatistics_) { + auto ptr = + new ScopedCallbacks(beamSearchStatistics_->onEachStepStarted, + beamSearchStatistics_->onEachStepStoped, + i); + statisticsBlock.reset(ptr); + } + if (stopBeamSearch_) break; + + if (i) connectPrevFrame(i, paths); + + if (this->beamSearchCtrlCallbacks_) { + std::vector*> prefixes; + prefixes.resize(paths.size()); + std::transform( + paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) { + return const_cast*>(&p.ids); + }); + beamSearchCtrlCallbacks_->beamSearchCandidateAdjust( + prefixes, frames_[machineCur].get(), i); + } + + forwardFrame(machineCur); + beamExpand(paths, newPaths); + if (newPaths.empty()) break; + + paths = newPaths; + newPaths.clear(); + } // end for machineCur + fillGenOutputs(); +} + +void RecurrentGradientMachine::Path::adjustProb(int calc_id, bool atEos) { + if (gDiyProbMethod) { + logProb = gDiyProbMethod(calc_id, ids.size(), ids.data(), logProb, atEos); + } +} + +} // namespace paddle diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h similarity index 100% rename from paddle/gserver/gradientmachines/RecurrentGradientMachine.h rename to paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h diff --git a/paddle/gserver/layers/AddtoLayer.cpp b/paddle/legacy/gserver/layers/AddtoLayer.cpp similarity index 100% rename from paddle/gserver/layers/AddtoLayer.cpp rename to paddle/legacy/gserver/layers/AddtoLayer.cpp diff --git a/paddle/legacy/gserver/layers/AddtoLayer.h b/paddle/legacy/gserver/layers/AddtoLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..1f948de4756cfd3c6c990475bfac4f44004c9068 --- /dev/null +++ b/paddle/legacy/gserver/layers/AddtoLayer.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * This layer simply adds all of its input layers together, then applies the
+ * activation to the sum. Each input of this layer should have the same size,
+ * which is also the output size of this layer.
+ * \f[
+ *   y = f(\sum_{i} x_i + b)
+ * \f]
+ * where \f$y\f$ is the output, \f$x_i\f$ are the inputs, \f$b\f$ is the
+ * bias, and \f$f\f$ is the activation function.
+ *
+ * The config file api is addto_layer.
+ */
+class AddtoLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> biases_;
+
+ public:
+  explicit AddtoLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~AddtoLayer() {}
+
+  /**
+   * Initialization of AddtoLayer.
+   */
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  /**
+   * Forward propagation.
+   * @note There is no weight matrix for each input, because it is just a
+   *       simple add operation.
+   */
+  void forward(PassType passType) override;
+
+  /**
+   * Backward propagation.
+   */
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+} // namespace paddle
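
Editor's aside: the equation above is just an element-wise sum followed by an activation. A minimal CPU sketch of that forward rule on plain vectors; the names and the tanh choice are illustrative assumptions, not AddtoLayer's actual implementation:

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// y = f(sum_i x_i + b), with f = tanh for illustration.
std::vector<float> addtoForward(const std::vector<std::vector<float>>& inputs,
                                const std::vector<float>& bias) {
  assert(!inputs.empty());
  std::vector<float> y(inputs[0].size(), 0.f);
  for (const auto& x : inputs) {
    assert(x.size() == y.size());  // every input must have the same size
    for (size_t i = 0; i < y.size(); ++i) y[i] += x[i];
  }
  for (size_t i = 0; i < y.size(); ++i) y[i] = std::tanh(y[i] + bias[i]);
  return y;
}

int main() {
  std::vector<float> out =
      addtoForward({{0.1f, 0.2f}, {0.3f, 0.4f}}, {0.f, 0.f});
  std::printf("%.4f %.4f\n", out[0], out[1]);  // tanh(0.4), tanh(0.6)
  return 0;
}
```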
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/legacy/gserver/layers/AgentLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/AgentLayer.cpp
rename to paddle/legacy/gserver/layers/AgentLayer.cpp
diff --git a/paddle/legacy/gserver/layers/AgentLayer.h b/paddle/legacy/gserver/layers/AgentLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..f506db2f2dbfe844a7fcb45a16108dd8f2c660d9
--- /dev/null
+++ b/paddle/legacy/gserver/layers/AgentLayer.h
@@ -0,0 +1,177 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * AgentLayer is used as a virtual input to another layer in the config.
+ * Before executing forward/backward, setRealLayer() should be called to
+ * set one and only one real layer.
+ */
+class AgentLayer : public Layer {
+ protected:
+  LayerPtr realLayer_;
+  int numSamples_;
+
+ public:
+  explicit AgentLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~AgentLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  // if *numSamples* is set,
+  // the real layer's output will only use its first *numSamples* rows
+  void setRealLayer(LayerPtr layer, int numSamples = 0) {
+    realLayer_ = layer;
+    numSamples_ = numSamples;
+  }
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override {}
+};
+
+/**
+ * Like AgentLayer, but it can gather many real layers. Each real layer
+ * gives a few rows of a sequence; after gathering all the real layers,
+ * GatherAgentLayer collects a complete sequence.
+ */
+class GatherAgentLayer : public Layer {
+ protected:
+  std::vector<LayerPtr> realLayers_;
+  std::vector<IVectorPtr> idsVec_;
+  // we don't clear the idsVec_ vector, to avoid IVector alloc/free
+  IVectorPtr allIds_;
+  std::vector<int> idIndex_;
+
+ public:
+  explicit GatherAgentLayer(const LayerConfig& config) : Layer(config) {}
+
+  virtual ~GatherAgentLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  // call before addRealLayer
+  void clearRealLayers() { realLayers_.clear(); }
+
+  void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions,
+                             ICpuGpuVectorPtr subSequenceStartPositions,
+                             const IVectorPtr& allIds,
+                             const std::vector<int>& idIndex);
+
+  // add one real layer; can be called many times
+  void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); }
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+  void forwardValue(PassType passType);
+  void forwardIds(PassType passType);
+};
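
Editor's aside: the gather/scatter pair in this header is, at its core, row selection by index. A small self-contained sketch of both directions on a row-major matrix; the `Mat` type and function names are illustrative, not Paddle's:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// A row-major matrix: data.size() == rows * width.
struct Mat {
  size_t width;
  std::vector<float> data;
  float* row(size_t r) { return &data[r * width]; }
};

// Scatter direction: pick the rows listed in `ids` out of `src`
// (the view a small submodel works on).
Mat selectRows(Mat& src, const std::vector<int>& ids) {
  Mat dst{src.width, std::vector<float>(ids.size() * src.width)};
  for (size_t i = 0; i < ids.size(); ++i)
    std::copy(src.row(ids[i]), src.row(ids[i]) + src.width, dst.row(i));
  return dst;
}

// Gather direction: write rows of `part` back into `dst` at given indices.
void placeRows(Mat& dst, Mat& part, const std::vector<int>& ids) {
  for (size_t i = 0; i < ids.size(); ++i)
    std::copy(part.row(i), part.row(i) + part.width, dst.row(ids[i]));
}

int main() {
  Mat m{2, {0, 0, 1, 1, 2, 2, 3, 3}};  // 4 rows of width 2
  Mat sub = selectRows(m, {3, 1});     // rows 3 and 1, in that order
  std::printf("%g %g\n", sub.row(0)[0], sub.row(1)[0]);  // 3 1
  placeRows(m, sub, {0, 2});           // overwrite rows 0 and 2
  return 0;
}
```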
+/**
+ * Like AgentLayer, but it only selects a few rows in the real layer.
+ * [idIndex, idIndex + idSize) of *ids* in setRealLayerAndOutput()
+ * are the selected row ids. It's used to scatter one layer's output
+ * to many small submodels. ScatterAgentLayer can also support an ids real
+ * layer; if it has one, the agent will select a few ids in the real layer.
+ */
+class ScatterAgentLayer : public Layer {
+ protected:
+  LayerPtr realLayer_;
+  IVectorPtr ids_;
+  IVectorPtr cpuIds_;
+  Argument realOutArg_;
+  int idIndex_;
+  int idSize_;
+  int seqStartPosIndex_;
+  int numSequences_;  // number of sequences in this scatterAgentLayer
+  bool handleBackward_;
+
+  // used to store the expanded cpuStartPositions or
+  // subSequenceStartPositions of the real layer.
+  ICpuGpuVectorPtr inputStartPos_;
+
+  // true for setRealLayer, false for setRealLayerAndOutput
+  bool selectionMode_;
+
+ public:
+  explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
+
+  virtual ~ScatterAgentLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  /**
+   * @brief set the real layer in generation
+   *
+   * @param layer[input] realLayer
+   * @param ids[input] row ids in the real layer
+   * @param copyId[input] whether to copy a cpu version of ids,
+   *                      false (default) in ScatterAgentLayer, and
+   *                      true in SequenceScatterAgentLayer.
+   */
+  void setRealLayer(LayerPtr layer, const std::vector<int>& ids) {
+    realLayer_ = layer;
+    IVector::resizeOrCreate(ids_, ids.size(), useGpu_);
+    ids_->copyFrom(ids.data(), ids.size());
+    if (useGpu_) {
+      IVector::resizeOrCreate(cpuIds_, ids.size(), false);
+      cpuIds_->copyFrom(ids.data(), ids.size());
+    } else {
+      cpuIds_ = ids_;
+    }
+    selectionMode_ = true;
+  }
+
+  // set the real layer and output; [idIndex, idIndex + idSize) of *ids*
+  // are the selected rows for realOutArg in realLayer
+  void setRealLayerAndOutput(LayerPtr layer,
+                             const Argument& outArg,
+                             const IVectorPtr& ids,
+                             int idIndex,
+                             int idSize,
+                             bool handleBackward) {
+    realLayer_ = layer;
+    realOutArg_ = outArg;
+    ids_ = ids;
+    idIndex_ = idIndex;
+    idSize_ = idSize;
+    handleBackward_ = handleBackward;
+    selectionMode_ = false;
+  }
+
+  void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
+                                 int seqStartPosIndex,
+                                 int numSequences) {
+    realOutArg_.sequenceStartPositions = sequenceStartPositions;
+    seqStartPosIndex_ = seqStartPosIndex;
+    numSequences_ = numSequences;
+  }
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+  void forwardWithSelection(PassType passType);
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/legacy/gserver/layers/AverageLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/AverageLayer.cpp
rename to paddle/legacy/gserver/layers/AverageLayer.cpp
diff --git a/paddle/legacy/gserver/layers/AverageLayer.h b/paddle/legacy/gserver/layers/AverageLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0d457d35f4bce99860cf45e94525f323f45e286
--- /dev/null
+++ b/paddle/legacy/gserver/layers/AverageLayer.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "SequencePoolLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * A layer for "internal average" over sequence input.
+ * Input: one or more sequences. Each sequence contains some instances.
+ * If SequenceLevel = kNonSeq:
+ *   Output: the output size is the number of input sequences (NOT input
+ *           instances); output[i] = average over the instances of the i-th
+ *           input sequence
+ *   If stride_ > 0:
+ *     Output: a shortened sequence. Stride is the step size by which we
+ *             slide a window over the input sequence, and the average
+ *             pooling operation is then applied to each window
+ *             independently.
+ * If SequenceLevel = kSeq:
+ *   The input sequence must contain sub-sequences.
+ *   Output: the output size is the number of input sub-sequences;
+ *           output[i] = average over the instances of the i-th sub-sequence
+ *
+ * The config file api is pooling_layer.
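+ *
+ * (Editorial example, an assumption about the windowing rather than a quote
+ * of this layer's code) With stride_ = 5 and an input sequence of 12
+ * instances, the windows would be [0,5), [5,10), [10,12), so the output
+ * would be a sequence of 3 averaged instances, the last window being
+ * partial.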
+ */ +class AverageLayer : public SequencePoolLayer { + public: + enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 }; + explicit AverageLayer(const LayerConfig& config) + : SequencePoolLayer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + protected: + int mode_; +}; +} // namespace paddle diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp similarity index 100% rename from paddle/gserver/layers/BatchNormBaseLayer.cpp rename to paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/legacy/gserver/layers/BatchNormBaseLayer.h similarity index 100% rename from paddle/gserver/layers/BatchNormBaseLayer.h rename to paddle/legacy/gserver/layers/BatchNormBaseLayer.h diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp similarity index 100% rename from paddle/gserver/layers/BatchNormalizationLayer.cpp rename to paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/legacy/gserver/layers/BatchNormalizationLayer.h similarity index 100% rename from paddle/gserver/layers/BatchNormalizationLayer.h rename to paddle/legacy/gserver/layers/BatchNormalizationLayer.h diff --git a/paddle/gserver/layers/BilinearInterpLayer.cpp b/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp similarity index 100% rename from paddle/gserver/layers/BilinearInterpLayer.cpp rename to paddle/legacy/gserver/layers/BilinearInterpLayer.cpp diff --git a/paddle/legacy/gserver/layers/BilinearInterpLayer.h b/paddle/legacy/gserver/layers/BilinearInterpLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..c585a5ed10d9c8f241b5a5ff3a671752fda6d432 --- /dev/null +++ b/paddle/legacy/gserver/layers/BilinearInterpLayer.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief A layer for bilinear interpolation which is + * used on conv layer output. + * + * @note The config file api is bilinear_interp_layer. 
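+ *
+ * (Editorial sketch; an assumption about the usual bilinear formulation,
+ * not a quote of this layer's code) the scale ratios are typically derived
+ * from the member variables below as:
+ * \code
+ *   ratioH_ = (outImgH_ > 1) ? real(inImgH_ - 1) / (outImgH_ - 1) : 0;
+ *   ratioW_ = (outImgW_ > 1) ? real(inImgW_ - 1) / (outImgW_ - 1) : 0;
+ * \endcode
+ * so the source coordinates of output pixel (i, j) are y = ratioH_ * i and
+ * x = ratioW_ * j, followed by a weighted average of the 4 nearest pixels.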
+ */
+class BilinearInterpLayer : public Layer {
+ protected:
+  size_t outImgH_, outImgW_;
+  size_t inImgH_, inImgW_;
+  real ratioH_, ratioW_;
+  size_t numChannels_;
+
+ public:
+  explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {}
+
+  virtual ~BilinearInterpLayer() {}
+
+  size_t getSize();
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+} // namespace paddle
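
Editor's aside: the next hunk adds BlockExpandLayer.h, whose doc comment gives the output-size formula that also reappears in ConvBaseLayer::calOutputSize() further down. A tiny sketch of that arithmetic (the function name is the editor's, not Paddle's; the formula is quoted from the header's own doc):

```cpp
#include <cstdio>

// outputSize = 1 + (2 * padding + imgSize - filterSize + stride - 1) / stride
int blockExpandOutputSize(int imgSize, int filterSize, int padding,
                          int stride) {
  return 1 + (2 * padding + imgSize - filterSize + stride - 1) / stride;
}

int main() {
  // A 28x28 image, 5x5 block/filter, padding 2, stride 1 -> 28x28 output.
  std::printf("%d\n", blockExpandOutputSize(28, 5, 2, 1));  // 28
  // With dilation d, the effective filter size is (filterSize - 1) * d + 1,
  // as used in calOutputSize() below.
  int effective = (5 - 1) * 2 + 1;  // 9
  std::printf("%d\n", blockExpandOutputSize(28, effective, 2, 1));  // 24
  return 0;
}
```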
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/legacy/gserver/layers/BlockExpandLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/BlockExpandLayer.cpp
rename to paddle/legacy/gserver/layers/BlockExpandLayer.cpp
diff --git a/paddle/legacy/gserver/layers/BlockExpandLayer.h b/paddle/legacy/gserver/layers/BlockExpandLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b90249bfb0958f0081e7c668cd3b38a53c39951
--- /dev/null
+++ b/paddle/legacy/gserver/layers/BlockExpandLayer.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief Expand feature map to minibatch matrix.
+ * - matrix width is: blockH_ * blockW_ * channels_
+ * - matrix height is: outputH_ * outputW_
+ *
+ * \f[
+ * outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) /
+ *             strideH\_ \\
+ * outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) /
+ *             strideW\_
+ * \f]
+ *
+ * The expand method is the same as in ExpandConvLayer, but the transposed
+ * value is saved. After expanding, output_.sequenceStartPositions will store
+ * the timeline. The number of time steps is outputH_ * outputW_ and the
+ * dimension of each time step is blockH_ * blockW_ * channels_. This layer
+ * can be used after a convolutional neural network, and before a recurrent
+ * neural network.
+ *
+ * The config file api is block_expand_layer.
+ */
+class BlockExpandLayer : public Layer {
+ protected:
+  /**
+   * @brief Calculate outputH_ and outputW_ and return the block number,
+   *        which actually is the number of time steps.
+   * @return time steps, outputH_ * outputW_.
+   */
+  size_t getBlockNum();
+  size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
+  size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
+
+  TensorShape inputShape_;
+  TensorShape outputShape_;
+
+ public:
+  explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~BlockExpandLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/legacy/gserver/layers/CRFDecodingLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/CRFDecodingLayer.cpp
rename to paddle/legacy/gserver/layers/CRFDecodingLayer.cpp
diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/legacy/gserver/layers/CRFDecodingLayer.h
similarity index 100%
rename from paddle/gserver/layers/CRFDecodingLayer.h
rename to paddle/legacy/gserver/layers/CRFDecodingLayer.h
diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/legacy/gserver/layers/CRFLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/CRFLayer.cpp
rename to paddle/legacy/gserver/layers/CRFLayer.cpp
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/legacy/gserver/layers/CRFLayer.h
similarity index 100%
rename from paddle/gserver/layers/CRFLayer.h
rename to paddle/legacy/gserver/layers/CRFLayer.h
diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/legacy/gserver/layers/CTCLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/CTCLayer.cpp
rename to paddle/legacy/gserver/layers/CTCLayer.cpp
diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/legacy/gserver/layers/CTCLayer.h
similarity index 100%
rename from paddle/gserver/layers/CTCLayer.h
rename to paddle/legacy/gserver/layers/CTCLayer.h
diff --git a/paddle/gserver/layers/ClipLayer.cpp b/paddle/legacy/gserver/layers/ClipLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/ClipLayer.cpp
rename to paddle/legacy/gserver/layers/ClipLayer.cpp
diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/legacy/gserver/layers/ConcatenateLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/ConcatenateLayer.cpp
rename to paddle/legacy/gserver/layers/ConcatenateLayer.cpp
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/legacy/gserver/layers/ContextProjection.cpp
similarity index 100%
rename from paddle/gserver/layers/ContextProjection.cpp
rename to paddle/legacy/gserver/layers/ContextProjection.cpp
diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/legacy/gserver/layers/ContextProjection.h
similarity index 100%
rename from paddle/gserver/layers/ContextProjection.h
rename to paddle/legacy/gserver/layers/ContextProjection.h
diff --git a/paddle/gserver/layers/Conv3DLayer.cpp b/paddle/legacy/gserver/layers/Conv3DLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/Conv3DLayer.cpp
rename to paddle/legacy/gserver/layers/Conv3DLayer.cpp
diff --git a/paddle/legacy/gserver/layers/Conv3DLayer.h b/paddle/legacy/gserver/layers/Conv3DLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb42a2f36d31365b473d7f593fd27dc063c83c47
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Conv3DLayer.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "ConvBaseLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief A subclass of convolution layer. + * This layer expands input and use matrix multiplication to + * calculate convolution operation. + */ +class Conv3DLayer : public ConvBaseLayer { + public: + explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {} + ~Conv3DLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void addBias(); + void backward(const UpdateCallback& callback); + void bpropBiases(); + void bpropData(int i); + void bpropWeights(int i); + size_t getSize(); + + protected: + // Figure out the dimensions for individual gemms. + IntV M_; /// numFilters_ / filter_group_; + IntV N_; /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_ + IntV K_; /// outputD_ * outputH_ * outputW_ + MatrixPtr colBuf_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvBaseLayer.cpp b/paddle/legacy/gserver/layers/ConvBaseLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d8997527fb1934b915bde2ae052159ea60ba302e --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvBaseLayer.cpp @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvBaseLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/utils/Logging.h" +namespace paddle { + +bool ConvBaseLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv") + ? false + : true; + + /* Initialize the convolutional layer parameter */ + numFilters_ = config_.num_filters(); + sharedBiases_ = config_.shared_biases(); + for (auto& inputConfig : config_.inputs()) { + const ConvConfig& conf = inputConfig.conv_conf(); + padding_.push_back(conf.padding()); + stride_.push_back(conf.stride()); + dilation_.push_back(conf.dilation()); + filterSize_.push_back(conf.filter_size()); + paddingY_.push_back(conf.padding_y()); + strideY_.push_back(conf.stride_y()); + dilationY_.push_back(conf.dilation_y()); + filterSizeY_.push_back(conf.filter_size_y()); + channels_.push_back(conf.channels()); + imgSizeH_.push_back(conf.has_img_size_y() ? 
conf.img_size_y() + : conf.img_size()); + imgSizeW_.push_back(conf.img_size()); + groups_.push_back(conf.groups()); + filterChannels_.push_back(conf.filter_channels()); + outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x()); + outputW_.push_back(conf.output_x()); + + paddingZ_.push_back(conf.padding_z()); + strideZ_.push_back(conf.stride_z()); + filterSizeZ_.push_back(conf.filter_size_z()); + imgSizeD_.push_back(conf.img_size_z()); + outputD_.push_back(conf.output_z()); + filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() * + filterSizeZ_.back()); + } + + CHECK(inputLayers_.size() == parameters_.size()); + + // create new weights_ in derived class + // create new biases_ in derived class + + // default caffe model + caffeMode_ = true; + + return true; +} + +size_t ConvBaseLayer::calOutputSize() { + auto clearAndReserve = [this](IntV* vec) { + vec->clear(); + vec->reserve(this->inputLayers_.size()); + }; + clearAndReserve(&imgSizeH_); + clearAndReserve(&imgSizeW_); + clearAndReserve(&outputH_); + clearAndReserve(&outputW_); + size_t layerSize = 0; + + auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) { + size_t filterSizeY; + size_t filterSize; + for (size_t i = 0; i < inputLayers_.size(); i++) { + filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1; + filterSize = (filterSize_[i] - 1) * dilation_[i] + 1; + inH.push_back(inputLayers_[i]->getOutput().getFrameHeight()); + inW.push_back(inputLayers_[i]->getOutput().getFrameWidth()); + const ConvConfig& conf = config_.inputs(i).conv_conf(); + if (isDeconv_) { + if (inH[i] == 0) + inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x(); + if (inW[i] == 0) inW[i] = conf.output_x(); + outH.push_back(imageSize( + inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_)); + outW.push_back( + imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_)); + } else { + if (inH[i] == 0) + inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + if (inW[i] == 0) inW[i] = conf.img_size(); + outH.push_back(outputSize( + inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_)); + outW.push_back(outputSize( + inW[i], filterSize, padding_[i], stride_[i], caffeMode_)); + } + CHECK_EQ(outH[i], outH[0]); + CHECK_EQ(outW[i], outW[0]); + } + getOutput().setFrameHeight(outH[0]); + getOutput().setFrameWidth(outW[0]); + layerSize = outH[0] * outW[0] * size_t(numFilters_); + }; + + setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_); + + return layerSize; +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvBaseLayer.h b/paddle/legacy/gserver/layers/ConvBaseLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..01e90e999625f986b0f13d2b73a883297c097841 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvBaseLayer.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/MathUtils.h"
+namespace paddle {
+
+/**
+ * @brief A Base Convolution Layer, which convolves the input image
+ *        with learned filters and (optionally) adds biases.
+ */
+
+class ConvBaseLayer : public Layer {
+ protected:
+  typedef std::vector<int> IntV;
+
+  /// True if it's a deconv layer, false if it's a convolution layer
+  bool isDeconv_;
+
+  /// The number of filters.
+  int numFilters_;
+  /// The x dimension of the padding.
+  IntV padding_;
+  /// The y dimension of the padding.
+  IntV paddingY_;
+  /// The x dimension of the stride.
+  IntV stride_;
+  /// The y dimension of the stride.
+  IntV strideY_;
+  /// The x dimension of the dilation.
+  IntV dilation_;
+  /// The y dimension of the dilation.
+  IntV dilationY_;
+  /// The x dimension of a filter kernel.
+  IntV filterSize_;
+  /// The y dimension of a filter kernel.
+  IntV filterSizeY_;
+  /// The number of channels of the convolution input.
+  IntV channels_;
+  /// The spatial dimensions of input feature map height.
+  IntV imgSizeH_;
+  /// The spatial dimensions of input feature map width.
+  IntV imgSizeW_;
+  /// filterPixels_ = filterSizeX_ * filterSizeY_.
+  IntV filterPixels_;
+  /// filterChannels_ = channels_/groups_.
+  IntV filterChannels_;
+  /// The spatial dimensions of output feature map height.
+  IntV outputH_;
+  /// The spatial dimensions of output feature map width.
+  IntV outputW_;
+
+  IntV outputD_;
+  IntV imgSizeD_;
+  IntV filterSizeZ_;
+  IntV strideZ_;
+  IntV paddingZ_;
+
+  /// Group size: refer to grouped convolution in
+  /// Alex Krizhevsky's paper: when group = 2, the first half of the
+  /// filters are only connected to the first half of the input channels,
+  /// and the second half are only connected to the second half.
+  IntV groups_;
+  /// Whether the bias is shared across features in each channel.
+  bool sharedBiases_;
+
+  /// shape of weight: (numChannels * filterPixels_, numFilters)
+  WeightList weights_;
+  /// If shared_biases is false, the shape of the bias is (numFilters_, 1).
+  /// If shared_biases is true, the shape of the bias is
+  /// (numFilters_ * outputX * outputY, 1).
+  std::unique_ptr<Weight> biases_;
+
+  /// True by default. The only difference is the calculation
+  /// of the output size.
+  bool caffeMode_;
+
+ public:
+  explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  /**
+   * imgSizeH_ and imgSizeW_ will be set according to the previous input
+   * layers in this function. Then it will calculate outputH_ and outputW_
+   * and set them into the output argument.
+   */
+  virtual size_t calOutputSize();
+
+  Weight& getWeight(int idx) { return *weights_[idx]; }
+};
+
+} // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseOperator.cpp b/paddle/legacy/gserver/layers/ConvBaseOperator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8e59b3bfe9d8a9e54e5c11906707d10ec346a4d
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvBaseOperator.cpp
@@ -0,0 +1,151 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvBaseOperator.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvBaseOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for two inputs are the same. Each data of the first input + * is convolved with each data of the second input indepedently. + * + * The config file api is conv_operator. + */ + +ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu) + : Operator(config, useGpu) { + CHECK(useGpu); + CHECK_EQ(config_.input_indices_size(), 2L); + + caffeMode_ = true; + getConvParams(); + computeConvSizes(); + + // initialize all to default algorithms + fwdAlgo_ = 0; + bwdFilterAlgo_ = 0; + bwdDataAlgo_ = 0; + fwdLimitBytes_ = 0; + bwdDataLimitBytes_ = 0; + bwdFilterLimitBytes_ = 0; + workSpaceInBytes_ = 0; + workSpace_ = nullptr; + + isSelectAlgo_ = false; +} + +void ConvBaseOperator::allocConvWorkSpace() { + hl_conv_workspace(imageDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_, + /*useDilation*/ false); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + + if (maxWorkSpace > workSpaceInBytes_) { + if (workSpaceInBytes_ != 0) { + hl_free_mem_device(workSpace_); + } + // total amount of storage needed + workSpace_ = hl_malloc_device(maxWorkSpace); + workSpaceInBytes_ = maxWorkSpace; + } +} + +void ConvBaseOperator::computeConvSizes() { + hl_create_filter_descriptor( + &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_); + hl_create_tensor_descriptor(&imageDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_create_convolution_descriptor(&convDesc_, + imageDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); +} + +void ConvBaseOperator::reshapeImageDescriptors() { + hl_tensor_reshape(imageDesc_, + 1, + channels_, + imageH_, + imageW_, + channels_ * imageH_ * imageW_, + imageH_ * imageW_, + imageW_, + 1); + hl_tensor_reshape(outputDesc_, + 1, + numFilters_, + outputH_, + outputW_, + numFilters_ * outputH_ * outputW_, + outputH_ * outputW_, + outputW_, + 1); + hl_reset_convolution_descriptor(convDesc_, + imageDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); +} + +void ConvBaseOperator::getConvParams() { + configNumFilters_ = config_.num_filters(); + const ConvConfig &conf = config_.conv_conf(); + padding_ = conf.padding(); + stride_ = conf.stride(); + filterSize_ = conf.filter_size(); + paddingY_ = conf.padding_y(); + strideY_ = conf.stride_y(); + filterSizeY_ = conf.filter_size_y(); + filterPixels_ = filterSize_ * filterSizeY_; + configChannels_ = conf.channels(); + imgSize_ = conf.img_size(); + imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + imgPixels_ = imgSize_ * imgSizeY_; + CHECK_EQ(conf.groups(), 1U); + filterChannels_ = conf.filter_channels(); + outputX_ = conf.output_x(); + outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + outputs_ = outputX_ * outputX_; + + isDeconv_ = (config_.type() == "conv") ? 
false : true; + if (isDeconv_) { + channels_ = configNumFilters_; + numFilters_ = configChannels_; + } else { + channels_ = configChannels_; + numFilters_ = configNumFilters_; + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvBaseOperator.h b/paddle/legacy/gserver/layers/ConvBaseOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..4ac77f2d743abd6f01e8e3f1e2f4e730c0e6fb39 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvBaseOperator.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "Operator.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for two inputs are the same. Each data of the first input + * is convolved with each data of the second input indepedently. + * + * The config file api is conv_operator. + */ + +class ConvBaseOperator : public Operator { + public: + ConvBaseOperator(const OperatorConfig &config, bool useGpu); + /** + * Free workspace in device and destroy cudnn tensor descriptor. + */ + virtual ~ConvBaseOperator() { + if (workSpaceInBytes_ != 0) { + hl_free_mem_device(workSpace_); + workSpaceInBytes_ = 0; + } + + hl_destroy_tensor_descriptor(imageDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_filter_descriptor(filterDesc_); + hl_destroy_convolution_descriptor(convDesc_); + } + + protected: + /** + * Get convolution parameters from layer config and + * initialize member variables. + */ + void getConvParams(); + + /** + * Allocate Gpu Memory for cudnn convolution algorithms. + */ + void allocConvWorkSpace(); + + /** + * Create cudnn tensor descriptor for convolution operation. + */ + void computeConvSizes(); + + /** + * Reshape cudnn tensor descriptor. + */ + void reshapeImageDescriptors(); + + /** + * Reshape cudnn tensor descriptor. + */ + virtual void reshape(int batchSize) = 0; + + /** + * Check filter size is equal to the size calculated by parameters from + * layer config. + */ + void checkFilterSize(const MatrixPtr &filter) { + CHECK_EQ(static_cast(filter->getWidth()), + filterSize_ * filterSizeY_ * channels_ * numFilters_); + } + + /// Most of member variables are same with CudnnConvLayer. + /// There is no explanation here. 
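+  /// (Editorial note) workSpace_ is sized in allocConvWorkSpace() above as
+  /// the max of fwdLimitBytes_, bwdDataLimitBytes_ and bwdFilterLimitBytes_,
+  /// so one buffer can serve all three cudnn algorithms.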
+ bool isDeconv_; + int imageH_, imageW_, outputH_, outputW_; + hl_tensor_descriptor imageDesc_; + hl_tensor_descriptor outputDesc_; + hl_filter_descriptor filterDesc_; + hl_convolution_descriptor convDesc_; + bool caffeMode_; + int inputOffset_, outputOffset_, weightOffset_; + int numFilters_, channels_; + + /// from parsing config + int configNumFilters_, configChannels_; + int padding_, stride_, filterSize_, imgSize_, imgSizeY_; + int paddingY_, strideY_, filterSizeY_; + int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_; + + /// Following member variables are same with CudnnConvLayer. + /// There is no explanation here. + int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_; + size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_; + size_t workSpaceInBytes_; + void *workSpace_; + bool isSelectAlgo_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/legacy/gserver/layers/ConvBaseProjection.cpp similarity index 100% rename from paddle/gserver/layers/ConvBaseProjection.cpp rename to paddle/legacy/gserver/layers/ConvBaseProjection.cpp diff --git a/paddle/legacy/gserver/layers/ConvBaseProjection.h b/paddle/legacy/gserver/layers/ConvBaseProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..dcf5ce0f48daac396bab0ec7620303f6c1236fc2 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvBaseProjection.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Projection.h" +#include "paddle/legacy/math/MathUtils.h" + +namespace paddle { + +/** + * @brief Base class for ConvProjection and ConvTransProjection. + */ +class ConvBaseProjection : public Projection { + public: + /** + * Constructor. + */ + ConvBaseProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + ~ConvBaseProjection(); + + protected: + void getConvParams(); + void initCudnn(); + + void reshapeTensorDesc(int batchSize); + void reshape(int batchSize); + + virtual size_t calOutputSize() = 0; + virtual size_t calInputSize() = 0; + + static void* getSpaceBytes(size_t size); + + /// True if it's deconv projection layer, false if it's ConvProjection layer + bool isDeconv_; + /// imageH_ and imageW_ / outputH_ and outputW_ + /// is calculated from the input layer. + int imageH_, imageW_; + int outputH_, outputW_; + /// configImgH_ and configImgW_ / configOutH_ and configOutW_ + /// is obtained from config. 
+ int configImgH_, configImgW_; + int configOutH_, configOutW_; + /// channels_ and numFilters_ are defined in terms of convolution semantics + int channels_, numFilters_; + /// configChannels and configNumFilters_ are obtained from config + /// For Conv they are the same as channels_ and numFilters + /// For ConvTrans they are opposite to channels_ and numFilters + int configChannels_, configNumFilters_; + int paddingH_, paddingW_; + int strideH_, strideW_; + int dilationH_, dilationW_; + int filterH_, filterW_; + /// One group offset of input data. + int inputOffset_; + /// One group offset of output data. + int outputOffset_; + /// One group offset of weight. + int weightOffset_; + int groups_; + + /// Cudnn tensor descriptor for input. + hl_tensor_descriptor imageDesc_; + /// Cudnn tensor descriptor for output. + hl_tensor_descriptor outputDesc_; + /// Cudnn tensor descriptor for filter. + hl_filter_descriptor filterDesc_; + /// Cudnn tensor descriptor for a convolution operation. + hl_convolution_descriptor convDesc_; + + /// Record the algorithm for forward convolution, which is obtained by cudnn + /// api to search the best suited algorithm. + int fwdAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// filter coefficients. + int bwdFilterAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// the output. + int bwdDataAlgo_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// forward convolution with the specified algo. + size_t fwdLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardFilter with the specified algo. + size_t bwdDataLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardData with the specified algo. + size_t bwdFilterLimitBytes_; + /// Size of total work space. + size_t workSpaceInBytes_; + bool bias_; + + std::unique_ptr weight_; + static ThreadLocalD> convMem_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvOperator.cpp b/paddle/legacy/gserver/layers/ConvOperator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5276b2c3920eee923f13a47d40b4498c6846f94b --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvOperator.cpp @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvOperator.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for two inputs are the same. Each data of the first input + * is convolved with each data of the second input indepedently. + * + * The config file api is conv_operator. 
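+ *
+ * (Editorial note) Because the kernel is itself a batch input, forward()
+ * below walks the batch and offsets each sample by inputOffset_,
+ * weightOffset_ and outputOffset_; e.g. for sample b the filter pointer is
+ * ins_[1]->value->getData() + weightOffset_ * b.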
+ */ + +REGISTER_OPERATOR(conv, ConvOperator); + +void ConvOperator::reshape(int batchSize) { + imageH_ = ins_[0]->getFrameHeight(); + imageW_ = ins_[0]->getFrameWidth(); + if (imageH_ == 0) imageH_ = imgSizeY_; + if (imageW_ == 0) imageW_ = imgSize_; + outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_); + outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_); + /// Check that the outputSizes are consistent with config + CHECK_EQ(outputH_, outputY_); + CHECK_EQ(outputW_, outputX_); + out_->setFrameHeight(outputH_); + out_->setFrameWidth(outputW_); + + reshapeImageDescriptors(); + + inputOffset_ = channels_ * imageH_ * imageW_; + outputOffset_ = numFilters_ * outputH_ * outputW_; + weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; + + if (!isSelectAlgo_) { + allocConvWorkSpace(); + } + + isSelectAlgo_ = true; +} + +void ConvOperator::forward() { + size_t batchSize = ins_[0]->value->getHeight(); + reshape(batchSize); + CHECK_EQ(ins_[1]->value->getHeight(), batchSize); + checkFilterSize(ins_[1]->value); + Matrix::resizeOrCreate(out_->value, + batchSize, + outputH_ * outputW_ * numFilters_, + false, + useGpu_); + { + AsyncGpuBlock block; + for (size_t batchId = 0; batchId < batchSize; ++batchId) { + real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; + real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; + real *outData = out_->value->getData() + outputOffset_ * batchId; + hl_convolution_forward(imageDesc_, + inputData, + outputDesc_, + outData, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + fwdAlgo_); + } + } +} + +void ConvOperator::backward() { + size_t batchSize = ins_[0]->value->getHeight(); + { + AsyncGpuBlock block; + for (size_t batchId = 0; batchId < batchSize; ++batchId) { + real *outGrad = out_->grad->getData() + outputOffset_ * batchId; + if (ins_[1]->grad) { + real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; + real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; + hl_convolution_backward_filter(imageDesc_, + inputData, + outputDesc_, + outGrad, + filterDesc_, + weightGrad, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = ins_[0]->grad; + if (NULL != preGrad) { + real *inputGrad = preGrad->getData() + inputOffset_ * batchId; + real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; + hl_convolution_backward_data(imageDesc_, + inputGrad, + outputDesc_, + outGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdDataAlgo_); + } + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvOperator.h b/paddle/legacy/gserver/layers/ConvOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..8f31620111c8ff3818d83145e16012d22b067a12 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvOperator.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "ConvBaseOperator.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for two inputs are the same. Each data of the first input + * is convolved with each data of the second input indepedently. + * + * The config file api is conv_operator. + */ + +class ConvOperator : public ConvBaseOperator { + public: + ConvOperator(const OperatorConfig &config, bool useGpu) + : ConvBaseOperator(config, useGpu) {} + /** + * Free workspace in device and destroy cudnn tensor descriptor. + */ + virtual ~ConvOperator() {} + void forward() override; + void backward() override; + void reshape(int batchSize) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/legacy/gserver/layers/ConvProjection.cpp similarity index 100% rename from paddle/gserver/layers/ConvProjection.cpp rename to paddle/legacy/gserver/layers/ConvProjection.cpp diff --git a/paddle/legacy/gserver/layers/ConvProjection.h b/paddle/legacy/gserver/layers/ConvProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..890a17e2f8d2d05001f825f374e8ab6420f7b3ea --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvProjection.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ConvBaseProjection.h" +#include "paddle/legacy/math/MathUtils.h" + +namespace paddle { + +/** + * @brief Convolution projection do the same calculation with CudnnConvLayer. + */ +class ConvProjection : public ConvBaseProjection { + public: + /** + * Constructor. + */ + ConvProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : ConvBaseProjection(config, parameter, useGpu) {} + + ~ConvProjection() {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + virtual size_t calOutputSize(); + virtual size_t calInputSize(); +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvShiftLayer.cpp b/paddle/legacy/gserver/layers/ConvShiftLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dda1a91e450fe4d1636a6c9af9a15e473b517983 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvShiftLayer.cpp @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for circular convluation of two vectors, + * which is used in NEURAL TURING MACHINE. + * - Input: two vectors, the first is data (batchSize x dataDim) + * the second is shift weights (batchSize x shiftDim) + * - Output: a vector (batchSize x dataDim) + * Assumed that: + * - a[in]: contains M elements. + * - b[in]: contains N elements (N should be odd). + * - c[out]: contains M elements. + * + * \f[ + * c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j} + * \f] + * + * In this formula: + * - a's index is computed modulo M. + * - b's index is comupted modulo N. + * + * The config file api is conv_shift_layer. + */ + +class ConvShiftLayer : public Layer { + public: + explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {} + + ~ConvShiftLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(conv_shift, ConvShiftLayer); + +bool ConvShiftLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + + return true; +} + +void ConvShiftLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV0->getHeight(); + size_t dataDim = inV0->getWidth(); + + CHECK_EQ(batchSize, inV1->getHeight()); + CHECK_EQ(dataDim, getSize()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + resetOutput(batchSize, dataDim); + } + + MatrixPtr outV = getOutputValue(); + + REGISTER_TIMER_INFO("FwConvShiftTimer", getName().c_str()); + outV->circularConv(*inV0, *inV1); +} + +void ConvShiftLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr outG = getOutputGrad(); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + + REGISTER_TIMER_INFO("BwConvShiftTimer", getName().c_str()); + + if (inG0 && inG1) { + outG->circularConvDerivative(*outG, *inV0, *inV1, *inG0, *inG1); + } else { + CHECK(!inG0 || !inG1) << "Not supported"; + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.cpp b/paddle/legacy/gserver/layers/ConvTransOperator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f4ce2affb144152ed41a9d4be9fa87f800c83dbb --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvTransOperator.cpp @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+ */
+
+class ConvShiftLayer : public Layer {
+ public:
+  explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ConvShiftLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(conv_shift, ConvShiftLayer);
+
+bool ConvShiftLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+
+  return true;
+}
+
+void ConvShiftLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  size_t dataDim = inV0->getWidth();
+
+  CHECK_EQ(batchSize, inV1->getHeight());
+  CHECK_EQ(dataDim, getSize());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    resetOutput(batchSize, dataDim);
+  }
+
+  MatrixPtr outV = getOutputValue();
+
+  REGISTER_TIMER_INFO("FwConvShiftTimer", getName().c_str());
+  outV->circularConv(*inV0, *inV1);
+}
+
+void ConvShiftLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+
+  REGISTER_TIMER_INFO("BwConvShiftTimer", getName().c_str());
+
+  if (inG0 && inG1) {
+    outG->circularConvDerivative(*outG, *inV0, *inV1, *inG0, *inG1);
+  } else {
+    CHECK(!inG0 || !inG1) << "Not supported";
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.cpp b/paddle/legacy/gserver/layers/ConvTransOperator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f4ce2affb144152ed41a9d4be9fa87f800c83dbb
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvTransOperator.cpp
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvTransOperator.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvTransOperator takes two inputs to perform the transposed
+ * convolution. The first input is the image, and the second input is the
+ * convolution kernel. The heights of the two inputs must be the same. Each
+ * sample of the first input is convolved with the corresponding sample of
+ * the second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+
+REGISTER_OPERATOR(convt, ConvTransOperator);
+
+void ConvTransOperator::reshape(int batchSize) {
+  outputH_ = ins_[0]->getFrameHeight();
+  outputW_ = ins_[0]->getFrameWidth();
+  if (outputH_ == 0) outputH_ = outputY_;
+  if (outputW_ == 0) outputW_ = outputX_;
+  imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
+  imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_);
+  /// Check that the imageSizes are consistent with config
+  CHECK_EQ(imageH_, imgSizeY_);
+  CHECK_EQ(imageW_, imgSize_);
+  out_->setFrameHeight(imageH_);
+  out_->setFrameWidth(imageW_);
+
+  reshapeImageDescriptors();
+
+  inputOffset_ = numFilters_ * outputH_ * outputW_;
+  outputOffset_ = channels_ * imageH_ * imageW_;
+  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
+
+  if (!isSelectAlgo_) {
+    allocConvWorkSpace();
+  }
+
+  isSelectAlgo_ = true;
+}
+
+void ConvTransOperator::forward() {
+  size_t batchSize = ins_[0]->value->getHeight();
+  reshape(batchSize);
+  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
+  checkFilterSize(ins_[1]->value);
+  Matrix::resizeOrCreate(
+      out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_);
+  {
+    AsyncGpuBlock block;
+    // The forward pass of a transposed convolution is the cuDNN
+    // backward-data pass of a regular convolution.
+    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
+      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
+      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
+      real *outData = out_->value->getData() + outputOffset_ * batchId;
+      hl_convolution_backward_data(imageDesc_,
+                                   outData,
+                                   outputDesc_,
+                                   inputData,
+                                   filterDesc_,
+                                   wgtData,
+                                   convDesc_,
+                                   workSpace_,
+                                   workSpaceInBytes_,
+                                   bwdDataAlgo_);
+    }
+  }
+}
+
+void ConvTransOperator::backward() {
+  size_t batchSize = ins_[0]->value->getHeight();
+  {
+    AsyncGpuBlock block;
+    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
+      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
+      if (ins_[1]->grad) {
+        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
+        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
+        hl_convolution_backward_filter(imageDesc_,
+                                       outGrad,
+                                       outputDesc_,
+                                       inputData,
+                                       filterDesc_,
+                                       weightGrad,
+                                       convDesc_,
+                                       workSpace_,
+                                       workSpaceInBytes_,
+                                       bwdFilterAlgo_);
+      }
+
+      MatrixPtr preGrad = ins_[0]->grad;
+      if (NULL != preGrad) {
+        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
+        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
+        // For a transposed convolution, the gradient w.r.t. the input is
+        // computed with the forward convolution kernel.
+        hl_convolution_forward(imageDesc_,
+                               outGrad,
+                               outputDesc_,
+                               inputGrad,
+                               filterDesc_,
+                               wgtData,
+                               convDesc_,
+                               workSpace_,
+                               workSpaceInBytes_,
+                               fwdAlgo_);
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.h b/paddle/legacy/gserver/layers/ConvTransOperator.h
new file mode 100644
index 0000000000000000000000000000000000000000..206335a01ff7509eaa5528002c6c9686f05c931b
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvTransOperator.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "ConvBaseOperator.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvTransOperator takes two inputs to perform the transposed
+ * convolution. The first input is the image, and the second input is the
+ * convolution kernel. The heights of the two inputs must be the same. Each
+ * sample of the first input is convolved with the corresponding sample of
+ * the second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+
+class ConvTransOperator : public ConvBaseOperator {
+ public:
+  ConvTransOperator(const OperatorConfig &config, bool useGpu)
+      : ConvBaseOperator(config, useGpu) {}
+  /**
+   * Free workspace in device and destroy cudnn tensor descriptor.
+   */
+  virtual ~ConvTransOperator() {}
+  void forward() override;
+  void backward() override;
+  void reshape(int batchSize) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/legacy/gserver/layers/ConvTransProjection.cpp
similarity index 100%
rename from paddle/gserver/layers/ConvTransProjection.cpp
rename to paddle/legacy/gserver/layers/ConvTransProjection.cpp
diff --git a/paddle/legacy/gserver/layers/ConvTransProjection.h b/paddle/legacy/gserver/layers/ConvTransProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b63dd47352b9f24810d9406b314fbfa15ae13c3
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvTransProjection.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ConvBaseProjection.h"
+#include "paddle/legacy/math/MathUtils.h"
+
+namespace paddle {
+
+/**
+ * @brief Convolution projection does the same calculation as CudnnConvLayer.
+ */
+class ConvTransProjection : public ConvBaseProjection {
+ public:
+  /**
+   * Constructor.
+   */
+  ConvTransProjection(const ProjectionConfig& config,
+                      ParameterPtr parameter,
+                      bool useGpu)
+      : ConvBaseProjection(config, parameter, useGpu) {}
+
+  ~ConvTransProjection() {}
+
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+  virtual size_t calOutputSize();
+  virtual size_t calInputSize();
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp b/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..29a71fc1d9be2ff3c2688d647b0a1892631f3cc8
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for weighted sum of vectors,
+ * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND
+ * TRANSLATE
+ * - Input: the size of the first input is weightDim,
+ *          and the size of the second input is weightDim * dataDim.
+ * - Output: the size of the output is dataDim
+ * \f[
+ *   out(j) = \sum_{i}(in0(i) * in1(i,j + i * dataDim)),
+ *            i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1)
+ * \f]
+ * Note that the above computation is for one sample. Multiple samples are
+ * processed in one batch.
+ *
+ * The config file api is linear_comb_layer.
+ */
+class ConvexCombinationLayer : public Layer {
+ protected:
+  /// A matrix pointer pointing to second input.
+  MatrixPtr tmpMtx0;
+  /// A matrix pointer pointing to first input.
+  MatrixPtr tmpRow0;
+  /// A matrix pointer pointing to output.
+  MatrixPtr tmpRow1;
+
+ public:
+  explicit ConvexCombinationLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ConvexCombinationLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(convex_comb, ConvexCombinationLayer);
+
+bool ConvexCombinationLayer::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(2U, inputLayers_.size());
+  size_t dataDim = getSize();
+  size_t weightDim = inputLayers_[0]->getSize();
+
+  CHECK_EQ(weightDim * dataDim, inputLayers_[1]->getSize())
+      << "Dimension mismatch";
+
+  tmpRow0 = Matrix::create(nullptr,
+                           /* height= */ 1,
+                           weightDim,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpRow1 = Matrix::create(nullptr,
+                           /* height= */ 1,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpMtx0 = Matrix::create(nullptr,
+                           /* height= */ weightDim,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+
+  return true;
+}
+
+void ConvexCombinationLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  size_t weightDim = inV0->getWidth();
+  size_t dataDim = getSize();
+
+  CHECK_EQ(batchSize, inV1->getHeight());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, dataDim);
+  }
+
+  MatrixPtr outV = getOutputValue();
+
+  REGISTER_TIMER_INFO("FwCvxCombTimer", getName().c_str());
+  for (size_t i = 0; i < batchSize; i++) {
+    tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
+    tmpRow0->setData(inV0->getData() + i * weightDim);
+    tmpRow1->setData(outV->getData() + i * dataDim);
+
+    tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 0);
+  }
+}
+
+void ConvexCombinationLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+
+  size_t batchSize = inV0->getHeight();
+  size_t weightDim = inV0->getWidth();
+  size_t dataDim = getSize();
+
+  REGISTER_TIMER_INFO("BwCvxCombTimer", getName().c_str());
+
+  if (inG0) {
+    for (size_t i = 0; i < batchSize; i++) {
+      tmpRow0->setData(inG0->getData() + i * weightDim);
+      tmpRow1->setData(outG->getData() + i * dataDim);
+      tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
+
+      tmpRow0->mul(*tmpRow1, *(tmpMtx0->getTranspose()), 1, 1);
+    }
+  }
+
+  if (inG1) {
+    for (size_t i = 0; i < batchSize; i++) {
+      tmpRow0->setData(inV0->getData() + i * weightDim);
+      tmpRow1->setData(outG->getData() + i * dataDim);
+      tmpMtx0->setData(inG1->getData() + i * weightDim * dataDim);
+
+      tmpMtx0->mul(*(tmpRow0->getTranspose()), *tmpRow1, 1, 1);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/legacy/gserver/layers/CosSimLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/CosSimLayer.cpp
rename to paddle/legacy/gserver/layers/CosSimLayer.cpp
diff --git a/paddle/legacy/gserver/layers/CosSimLayer.h b/paddle/legacy/gserver/layers/CosSimLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e53de414d2f1f28627c831e9972ab6f7d1dd4ad
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CosSimLayer.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief A layer for calculating cosine similarity between two vectors
+ * \f[
+ *   f(x,y)=scale\frac{x_1y_1+x_2y_2+...+x_ny_n}{\sqrt{x_1^2+x_2^2+...
+ *   +x_n^2}\sqrt{y_1^2+y_2^2+...+y_n^2}}
+ * \f]
+ *
+ * - Input1: A vector (batchSize * dataDim)
+ * - Input2: A vector (batchSize * dataDim) or (1 * dataDim)
+ * - Output: A vector (batchSize * 1)
+ *
+ * The config file api is cos_sim.
+ */
+class CosSimLayer : public Layer {
+ public:
+  explicit CosSimLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~CosSimLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp b/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..da3ddf11dc7430d0e41d633d426c20ff0c400151
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp
@@ -0,0 +1,182 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+/**
+ * @brief A layer for computing cosine similarity between a vector
+ * and each row of a matrix
+ * out[i] = cos_scale * cos(in1, in2(i,:));
+ * @note used in NEURAL TURING MACHINE
+ *
+ * Input1: a vector (batchSize * dataDim)
+ *
+ * Input2: a matrix in vector form (batchSize * (weightDim*dataDim))
+ *
+ * Output: a vector (batchSize * weightDim)
+ */
+
+class CosSimVecMatLayer : public Layer {
+ protected:
+  MatrixPtr tmpMtx0;
+  MatrixPtr tmpMtx1;
+  MatrixPtr tmpRow0;
+  MatrixPtr tmpRow1;
+  MatrixPtr tmpRow2;
+  MatrixPtr tmpRow3;
+
+ public:
+  explicit CosSimVecMatLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~CosSimVecMatLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(cos_vm, CosSimVecMatLayer);
+
+bool CosSimVecMatLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+
+  size_t dataDim = inputLayers_[0]->getSize();
+  size_t numKeys = getSize();
+  size_t memoryDim = inputLayers_[1]->getSize();
+
+  CHECK_EQ(dataDim * numKeys, memoryDim) << "Dimension mismatch";
+
+  tmpRow0 = Matrix::create(nullptr,
+                           /* height= */ 1,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpRow1 = Matrix::create(nullptr,
+                           /* height= */ 1,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpRow2 = Matrix::create(nullptr,
+                           /* height= */ numKeys,
+                           1,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpRow3 = Matrix::create(nullptr,
+                           /* height= */ numKeys,
+                           1,
+                           /* trans= */ false,
+                           useGpu_);
+
+  tmpMtx0 = Matrix::create(nullptr,
+                           /* height= */ numKeys,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpMtx1 = Matrix::create(nullptr,
+                           /* height= */ numKeys,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+
+  CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1);
+
+  createFunction(forward_,
+                 "CosSimForward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+  createFunction(backward_,
+                 "CosSimBackward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+
+  return true;
+}
+
+void CosSimVecMatLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  size_t numKeys = getSize();
+
+  CHECK_EQ(batchSize, inV1->getHeight());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, numKeys);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  CHECK(outV && inV0 && inV1);
+  REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str());
+  for (size_t i = 0; i < batchSize; i++) {
+    tmpRow0->setData(inV0->rowBuf(i));
+    tmpMtx0->setData(inV1->rowBuf(i));
+    tmpRow2->setData(outV->rowBuf(i));
+
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*tmpMtx0);
+    inputs.addArg(*tmpRow0);
+    outputs.addArg(*tmpRow2, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
+  }
+}
+
+void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
+  CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed";
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+
+  size_t batchSize = inV0->getHeight();
+  CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG);
+  REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str());
+
+  for (size_t i = 0; i < batchSize; i++) {
+    tmpRow0->setData(inV0->rowBuf(i));
+    tmpRow1->setData(inG0->rowBuf(i));
+    tmpMtx0->setData(inV1->rowBuf(i));
+    tmpMtx1->setData(inG1->rowBuf(i));
+    tmpRow2->setData(outV->rowBuf(i));
+    tmpRow3->setData(outG->rowBuf(i));
+
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*tmpRow3);
+    inputs.addArg(*tmpRow2);
+    inputs.addArg(*tmpMtx0);
+    inputs.addArg(*tmpRow0);
+    outputs.addArg(*tmpMtx1, ADD_TO);
+    outputs.addArg(*tmpRow1, ADD_TO);
+
+    backward_[0]->calc(inputs, outputs);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CostLayer.cpp b/paddle/legacy/gserver/layers/CostLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2c0762be25eec702af350fefab7a65dd72dbf7af
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CostLayer.cpp
@@ -0,0 +1,748 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CostLayer.h"
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include "paddle/utils/Logging.h"
+
+#include "paddle/legacy/math/SparseMatrix.h"
+
+namespace paddle {
+
+bool CostLayer::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  bool ret = Layer::init(layerMap, parameterMap);
+  coeff_ = config_.coeff();
+  if (!ret) return ret;
+  CHECK_GE(inputLayers_.size(), 2UL);
+  CHECK_LE(inputLayers_.size(), 3UL);
+  if (inputLayers_.size() == 3) {
+    weightLayer_ = inputLayers_[2];
+  }
+  return true;
+}
+
+void CostLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInputValue(*getOutputLayer())->getHeight();
+  int size = 1;
+  resetOutput(batchSize, size);
+
+  const MatrixPtr& output = getInputValue(*getOutputLayer());
+  Argument label = getInput(*getLabelLayer());
+
+  /* get the cost value for each sample*/
+  forwardImp(*output, label, *getOutputValue());
+  if (weightLayer_) {
+    const MatrixPtr& weight = getInputValue(*weightLayer_);
+    getOutputValue()->dotMul(*getOutputValue(), *weight);
+  }
+}
+
+void CostLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  const Argument& output = getInput(*getOutputLayer());
+  Argument label = getInput(*getLabelLayer());
+
+  bool support = true;
+  if (weightLayer_) {
+    support = output.grad->getAbsSum() == 0;
+  }
+
+  backwardImp(*output.value, label, *output.grad);
+
+  if (weightLayer_) {
+    CHECK(support) << "Weighted cost layer '" << getName()
+                   << "' must be the last layer "
+                      "connected to the output layer '"
+                   << getOutputLayer()->getName() << "'";
+    output.grad->rowScale(0, *output.grad, *getInputValue(*weightLayer_));
+  }
+  if (coeff_ != real(1.0f)) {
+    output.grad->add(coeff_, 0);
+  }
+}
+
+//
+// class MultiClassCrossEntropy
+//
+bool MultiClassCrossEntropy::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void MultiClassCrossEntropy::forwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& target) {
+  target.oneHotCrossEntropy(output, *label.ids);
+}
+
+void MultiClassCrossEntropy::backwardImp(Matrix& output,
+                                         Argument& label,
+                                         Matrix& outputG) {
+  outputG.oneHotCrossEntropyBp(output, *label.ids);
+}
+
+//
+// class MultiClassCrossEntropyWithSelfNorm
+//
+REGISTER_LAYER(multi_class_cross_entropy_with_selfnorm,
+               MultiClassCrossEntropyWithSelfNorm);
+
+bool MultiClassCrossEntropyWithSelfNorm::init(
+    const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output,
+                                                    Argument& label,
+                                                    Matrix& target) {
+  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
+  output.rowSum(*sftMaxSum_);
+  sftMaxSum_->log2();
+
+  target.oneHotCrossEntropy(output, *label.ids);
+  target.add(*sftMaxSum_);
+
+  sftMaxSum_->square2();
+  target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha());
+}
+
+void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output,
+                                                     Argument& label,
+                                                     Matrix& outputG) {
+  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
+  output.rowSum(*sftMaxSum_);
+
+  Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_);
+  sftMaxSum_->reciprocal2(*sumInv_);
+
+  outputG.oneHotCrossEntropyBp(output, *label.ids);
+  outputG.addColumnVector(*sumInv_);
+
+  sftMaxSum_->log2();
+  sumInv_->dotMul(*sumInv_, *sftMaxSum_);
+  sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha());
+
+  outputG.addColumnVector(*sumInv_);
+}
+
+//
+// class SoftBinaryClassCrossEntropy
+//
+REGISTER_LAYER(soft_binary_class_cross_entropy, SoftBinaryClassCrossEntropy);
+
+bool SoftBinaryClassCrossEntropy::init(const LayerMap& layerMap,
+                                       const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output,
+                                             Argument& label,
+                                             Matrix& target) {
+  Matrix::resizeOrCreate(
+      targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_);
+
+  targetPerDim_->softCrossEntropy(output, *label.value);
+  targetPerDim_->rowSum(target);
+}
+
+void SoftBinaryClassCrossEntropy::backwardImp(Matrix& output,
+                                              Argument& label,
+                                              Matrix& outputG) {
+  outputG.softCrossEntropyBp(output, *label.value);
+}
+
+//
+// class SumOfSquaresCostLayer
+//
+
+REGISTER_LAYER(square_error, SumOfSquaresCostLayer);
+
+bool SumOfSquaresCostLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void SumOfSquaresCostLayer::forwardImp(Matrix& output,
+                                       Argument& label,
+                                       Matrix& target) {
+  target.sumOfSquares(output, *label.value);
+}
+
+void SumOfSquaresCostLayer::backwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& outputG) {
+  outputG.sumOfSquaresBp(output, *label.value);
+}
+
+//
+// class SmoothL1CostLayer
+//
+
+REGISTER_LAYER(smooth_l1, SmoothL1CostLayer);
+
+bool SmoothL1CostLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void SmoothL1CostLayer::forwardImp(Matrix& output,
+                                   Argument& label,
+                                   Matrix& target) {
+  MatrixPtr targetCpu, outputCpu, labelCpu;
+  if (useGpu_) {
+    targetCpu =
+        Matrix::create(target.getHeight(), target.getWidth(), false, false);
+    outputCpu =
+        Matrix::create(output.getHeight(), output.getWidth(), false, false);
+    labelCpu = Matrix::create(
+        label.value->getHeight(), label.value->getWidth(), false, false);
+    targetCpu->copyFrom(target);
+    outputCpu->copyFrom(output);
+    labelCpu->copyFrom(*label.value);
+    targetCpu->smoothL1(*outputCpu, *labelCpu, 1.0);
+    target.copyFrom(*targetCpu);
+  } else {
+    target.smoothL1(output, *label.value, 1.0);
+  }
+}
+
+void SmoothL1CostLayer::backwardImp(Matrix& output,
+                                    Argument& label,
+                                    Matrix& outputG) {
+  MatrixPtr outputGCpu, outputCpu, labelCpu;
+  if (useGpu_) {
+    outputGCpu =
+        Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false);
+    outputCpu =
+        Matrix::create(output.getHeight(), output.getWidth(), false, false);
+    labelCpu = Matrix::create(
+        label.value->getHeight(), label.value->getWidth(), false, false);
+    outputGCpu->copyFrom(outputG);
+    outputCpu->copyFrom(output);
+    labelCpu->copyFrom(*label.value);
+    outputGCpu->smoothL1Bp(*outputCpu, *labelCpu, 1.0);
+    outputG.copyFrom(*outputGCpu);
+  } else {
+    outputG.smoothL1Bp(output, *label.value, 1.0);
+  }
+}
+
+//
+// class RankingCost
+//
+bool RankingCost::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  posPairCount_ = 0;
+  negPairCount_ = 0;
+
+  bool ret = Layer::init(layerMap, parameterMap);
+  if (!ret) return ret;
+  CHECK_GE(inputLayers_.size(), 3UL);
+  CHECK_LE(inputLayers_.size(), 4UL);
+  if (inputLayers_.size() == 4) {
+    weightLayer_ = inputLayers_[3];
+  }
+  return true;
+}
+
+void RankingCost::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInputValue(*getOutputLayer(0))->getHeight();
+  int size = 1;
+  resizeOutput(batchSize, size);
+  Matrix::resizeOrCreate(margin_, batchSize, size, /* trans= */ false, useGpu_);
+  MatrixPtr label = getInputValue(*getLabelLayer());
+  if (!label) {
+    // input label is not in value, try ids
+    IVectorPtr idLabel = getInput(*getLabelLayer()).ids;
+    CHECK(idLabel) << "label layer has neither value nor ids";
+    CHECK_EQ((size_t)batchSize, idLabel->getSize());
+    Matrix::resizeOrCreate(
+        labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, useGpu_);
+    labelBuf_->copyFrom(*idLabel);
+    label = labelBuf_;
+  }
+
+  MatrixPtr output[] = {getInputValue(*getOutputLayer(0)),
+                        getInputValue(*getOutputLayer(1))};
+  MatrixPtr target = this->getOutputValue();
+  margin_->sub(*output[0], *output[1]);
+
+  // for validation
+  size_t height = output[0]->getHeight();
+  target->biggerThan(*(output[0]), *(output[1]), *label);
+  double total = static_cast<double>(height);
+  if (weightLayer_) {
+    const MatrixPtr& weight = getInputValue(*weightLayer_);
+    target->dotMul(*target, *weight);
+    total = weight->getSum();
+  }
+  double pos = target->getSum();
+  posPairCount_ += pos;
+  negPairCount_ += (total - pos);
+
+  // forward
+  target->logisticRegressionLoss(*margin_, *label);
+  if (weightLayer_) {
+    const MatrixPtr& weight = getInputValue(*weightLayer_);
+    target->dotMul(*target, *weight);
+  }
+}
+
+void RankingCost::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  MatrixPtr label = getInputValue(*getLabelLayer());
+  if (!label) {
+    // input label is not in value, but in ids
+    // use labelBuf_ (it should already be resized and copied during forward)
+    label = labelBuf_;
+  }
+
+  Matrix::resizeOrCreate(
+      marginGrad_, label->getHeight(), 1, /* trans= */ false, useGpu_);
+  marginGrad_->zeroMem();
+  marginGrad_->logisticRegressionLossBp(*margin_, *label);
+  if (weightLayer_) {
+    const MatrixPtr& weight = getInputValue(*weightLayer_);
+    marginGrad_->dotMul(*marginGrad_, *weight);
+  }
+
+  getInputGrad(0)->add(*marginGrad_);
+  getInputGrad(1)->sub(*marginGrad_);
+}
+
+void RankingCost::onPassEnd() {
+  double ratio = posPairCount_ / ((negPairCount_ <= 0) ? 1.0 : negPairCount_);
+  LOG(INFO) << "calc pos/neg: " << ratio << " pos= " << posPairCount_
+            << " neg= " << negPairCount_;
+
+  posPairCount_ = 0;
+  negPairCount_ = 0;
+}
+
+//
+// class LambdaCost
+//
+REGISTER_LAYER(lambda_cost, LambdaCost);
+
+bool LambdaCost::init(const LayerMap& layerMap,
+                      const ParameterMap& parameterMap) {
+  truncationSize_ = config_.ndcg_num();
+  maxSortSize_ = config_.max_sort_size();
+  if (maxSortSize_ != -1) {
+    CHECK_GE(maxSortSize_, truncationSize_)
+        << "maxSortSize must be greater than or equal to NDCG size!";
+  }
+  LOG(INFO) << "LambdaRank v1.3, NDCG size = " << truncationSize_
+            << ", Max partial sort size = " << maxSortSize_;
+  CHECK(!useGpu_) << "LambdaRank supports CPU only!";
+  return Layer::init(layerMap, parameterMap);
+}
+
+void LambdaCost::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInputValue(*getOutputLayer())->getHeight();
+  resizeOutput(batchSize, 1);
+
+  MatrixPtr score = getInputValue(*getScoreLayer());
+  MatrixPtr output = getInputValue(*getOutputLayer());
+  MatrixPtr target = this->getOutputValue();
+
+  real* scoreData = score->getData();
+  real* outputData = output->getData();
+  real* targetData = target->getData();
+
+  auto startPos = getInput(*getOutputLayer()).sequenceStartPositions;
+  const int* startPosData = startPos->getData(false);
+  size_t batchNum = startPos->getSize() - 1;
+  // compute the NDCG of each sequence (query) and broadcast it to every
+  // position in that sequence.
+  for (size_t i = 0; i < batchNum; ++i) {
+    int beginPos = startPosData[i];
+    int endPos = startPosData[i + 1];
+    real NDCG = calcNDCG(
+        outputData + beginPos, scoreData + beginPos, endPos - beginPos);
+    for (int j = beginPos; j < endPos; ++j) {
+      targetData[j] = NDCG;
+    }
+  }
+}
+
+void LambdaCost::backward(const UpdateCallback& callback) {
+  (void)callback;
+  MatrixPtr score = getInputValue(*getScoreLayer());
+  MatrixPtr output = getInputValue(*getOutputLayer());
+  Matrix::resizeOrCreate(marginGrad_,
+                         score->getHeight(),
+                         1,
+                         /* trans= */ false,
+                         useGpu_);
+  marginGrad_->zeroMem();
+
+  real* gradData = marginGrad_->getData();
+  real* scoreData = score->getData();
+  real* outputData = output->getData();
+
+  auto startPos = getInput(*getOutputLayer()).sequenceStartPositions;
+  const int* startPosData = startPos->getData(false);
+  size_t batchNum = startPos->getSize() - 1;
+
+  // accumulate LambdaRank gradients per sequence (query).
+  for (size_t i = 0; i < batchNum; ++i) {
+    int beginPos = startPosData[i];
+    int endPos = startPosData[i + 1];
+    calcGrad(outputData + beginPos,
+             scoreData + beginPos,
+             gradData + beginPos,
+             endPos - beginPos);
+  }
+
+  getInputGrad(0)->add(*marginGrad_);
+}
+
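+// A worked instance of the DCG term used in calcGrad/calcNDCG below (the
+// numbers are illustrative, not from the original source): with truncation
+// size 2 and relevance scores sorted as {3, 1},
+//
+//   DCG = (2^3 - 1)/log(2) + (2^1 - 1)/log(3)
+//
+// and NDCG divides DCG by the maximal DCG over the ideal ordering, so NDCG
+// always lies in [0, 1].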
+void LambdaCost::calcGrad(const real* outputScore,
+                          const real* score,
+                          real* gradData,
+                          int size) {
+  CHECK_GE(size, truncationSize_)
+      << "Invalid: (Sample num in the same list) < (NDCG truncation num) !";
+  int sortSize = maxSortSize_ == -1 ? size : std::min(maxSortSize_, size);
+
+  scorePair_.clear();
+  for (int i = 0; i < size; ++i) {
+    scorePair_.push_back(std::make_pair(score[i], i));
+  }
+  if (size <= sortSize) {
+    std::sort(scorePair_.begin(),
+              scorePair_.end(),
+              [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
+                return a.first > b.first;
+              });
+  } else {
+    std::partial_sort(
+        scorePair_.begin(),
+        scorePair_.begin() + sortSize,
+        scorePair_.end(),
+        [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
+          return a.first > b.first;
+        });
+  }
+
+  real maxDCG = 0;
+  for (int i = 0; i < truncationSize_; ++i) {
+    maxDCG += (std::pow(2, scorePair_[i].first) - 1) / std::log(i + 2);
+  }
+  CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!";
+
+  for (int i = 0; i < sortSize; ++i) {
+    for (int j = i + 1; j < size; ++j) {
+      int index_i = scorePair_[i].second;
+      int index_j = scorePair_[j].second;
+      real score_i = score[index_i];
+      real score_j = score[index_j];
+      real dcgDif = 0;
+      if (j < sortSize) {
+        dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) *
+                 (1 / std::log(i + 2) - 1 / std::log(j + 2));
+      } else {
+        dcgDif =
+            (std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2);
+      }
+
+      real lambda_ij =
+          -std::abs(dcgDif) /
+          (1 + std::exp(outputScore[index_i] - outputScore[index_j]));
+      gradData[index_i] += lambda_ij / maxDCG;
+      gradData[index_j] -= lambda_ij / maxDCG;
+    }
+  }
+}
+
+real LambdaCost::calcNDCG(const real* outputScore,
+                          const real* score,
+                          int size) {
+  CHECK_GE(size, truncationSize_)
+      << "Invalid: (Sample num in the same list) < (NDCG truncation num) !";
+
+  outputScorePair_.clear();
+  for (int i = 0; i < size; ++i) {
+    outputScorePair_.push_back(std::make_pair(outputScore[i], i));
+  }
+  std::partial_sort(
+      outputScorePair_.begin(),
+      outputScorePair_.begin() + truncationSize_,
+      outputScorePair_.end(),
+      [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
+        return a.first > b.first;
+      });
+
+  real DCG = 0;
+  for (int i = 0; i < truncationSize_; ++i) {
+    DCG +=
+        (std::pow(2, score[outputScorePair_[i].second]) - 1) / std::log(i + 2);
+  }
+
+  scoreVec_.resize(size);
+  std::copy(score, score + size, scoreVec_.begin());
+  real maxDCG = 0;
+  std::partial_sort(scoreVec_.begin(),
+                    scoreVec_.begin() + truncationSize_,
+                    scoreVec_.end(),
+                    std::greater<real>());
+  for (int i = 0; i < truncationSize_; ++i) {
+    maxDCG += (std::pow(2, scoreVec_[i]) - 1) / std::log(i + 2);
+  }
+  CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!";
+
+  return DCG / maxDCG;
+}
+
+//
+// class MultiBinaryLabelCrossEntropy
+//
+
+REGISTER_LAYER(multi_binary_label_cross_entropy, MultiBinaryLabelCrossEntropy);
+
+bool MultiBinaryLabelCrossEntropy::init(const LayerMap& layerMap,
+                                        const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output,
+                                              Argument& label,
+                                              Matrix& target) {
+  MatrixPtr value = nullptr;
+  if (label.ids) {
+    CHECK(!label.value);
+    value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_);
+  } else {
+    CHECK(label.value);
+    value = label.value;
+  }
+
+  if (dynamic_cast<CpuSparseMatrix*>(value.get()) ||
+      dynamic_cast<GpuSparseMatrix*>(value.get())) {
+    target.multiBinaryLabelCrossEntropy(output, *value);
+  } else {
+    Matrix::resizeOrCreate(
+        targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_);
+
+    targetPerDim_->binaryLabelCrossEntropy(output, *value);
+    targetPerDim_->rowSum(target);
+  }
+}
+
+void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
+                                               Argument& label,
+                                               Matrix& outputG) {
+  MatrixPtr value = nullptr;
+  if (label.ids) {
+    CHECK(!label.value);
+    value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_);
+  } else {
+    CHECK(label.value);
+    value = label.value;
+  }
+
+  if (dynamic_cast<CpuSparseMatrix*>(value.get()) ||
+      dynamic_cast<GpuSparseMatrix*>(value.get())) {
+    outputG.multiBinaryLabelCrossEntropyBp(output, *value);
+  } else {
+    outputG.binaryLabelCrossEntropyBp(output, *value);
+  }
+}
+
+bool HuberCost::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  CostLayer::init(layerMap, parameterMap);
+  if (useGpu_) {
+    tmpCpuInput_.reserve(inputLayers_.size());
+    for (size_t i = 0; i < inputLayers_.size(); i++) {
+      tmpCpuInput_.push_back(Argument());
+    }
+  }
+  return true;
+}
+
+void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
+  if (useGpu_) {
+    for (size_t i = 0; i < inputLayers_.size(); i++) {
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
+    }
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  }
+}
+
+//
+// Huber loss for robust regression.
+//
+REGISTER_LAYER(huber_regression, HuberRegressionLoss);
+
+bool HuberRegressionLoss::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  HuberCost::init(layerMap, parameterMap);
+  delta_ = config_.delta();
+  return true;
+}
+
+void HuberRegressionLoss::forwardImp(Matrix& output,
+                                     Argument& label,
+                                     Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
+  size_t numSamples = target.getHeight();
+  size_t dim = output.getWidth();
+  CHECK(label.value);
+  CHECK_EQ((*label.value).getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(dim, (*label.value).getWidth());
+  CHECK_EQ(target.getWidth(), (size_t)1);
+
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  std::vector<real> cost(numSamples, 0);
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      int index = i * dim + j;
+      real a = std::abs(lbl[index] - out[index]);
+      if (a <= delta_)
+        cost[i] += a * a / 2;
+      else
+        cost[i] += delta_ * (a - delta_ / 2);
+    }
+  }
+  target.copyFrom(cost.data(), numSamples);
+}
+
+void HuberRegressionLoss::backwardImp(Matrix& output,
+                                      Argument& label,
+                                      Matrix& outputG) {
+  size_t numSamples = output.getHeight();
+  size_t dim = output.getWidth();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      int index = i * dim + j;
+      real a = lbl[index] - out[index];
+      if (std::abs(a) <= delta_)
+        grad[index] += -a;
+      else
+        grad[index] += a > 0 ? -delta_ : delta_;
+    }
+  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
+}
+
+//
+// Huber loss for robust two-class classification
+//
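+// The piecewise loss computed in forwardImp below, with label y in {-1, +1}
+// and a = output * y (a sketch of the math, not part of the original
+// comments):
+//
+//   loss = -4a          if a < -1
+//   loss = (1 - a)^2    if -1 <= a < 1
+//   loss = 0            otherwise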
+REGISTER_LAYER(huber_classification, HuberTwoClassification);
+
+bool HuberTwoClassification::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  return HuberCost::init(layerMap, parameterMap);
+}
+
+void HuberTwoClassification::forwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
+  size_t numSamples = target.getHeight();
+  CHECK(label.ids);
+  CHECK_EQ((*label.ids).getSize(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(output.getWidth(), (size_t)1);
+  CHECK_EQ(target.getWidth(), (size_t)1);
+
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
+  std::vector<real> cost(numSamples, 0);
+  for (size_t i = 0; i < numSamples; ++i) {
+    int y = 2 * lbl[i] - 1;
+    real a = out[i] * y;
+    if (a < -1)
+      cost[i] = -4 * a;
+    else if (a < 1)
+      cost[i] = (1 - a) * (1 - a);
+  }
+  target.copyFrom(cost.data(), numSamples);
+}
+
+void HuberTwoClassification::backwardImp(Matrix& output,
+                                         Argument& label,
+                                         Matrix& outputG) {
+  size_t numSamples = output.getHeight();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    int y = 2 * lbl[i] - 1;
+    real a = out[i] * y;
+    if (a < -1)
+      grad[i] += -4 * y;
+    else if (a < 1)
+      grad[i] += -2 * (1 - a) * y;
+  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples);
+}
+/**
+ * This cost layer computes the sum of its input as loss.
+ * \f[
+ *   o(i) = \sum_{j=1}^D y_{ij}
+ * \f]
+ */
+class SumCostLayer : public Layer {
+ public:
+  explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    bool ret = Layer::init(layerMap, parameterMap);
+    if (!ret) return ret;
+    CHECK_EQ(inputLayers_.size(), 1UL);
+    return true;
+  }
+
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+    const MatrixPtr& input = getInputValue(0);
+
+    /* malloc memory for the output_ if necessary */
+    int batchSize = input->getHeight();
+    int size = 1;
+    resizeOutput(batchSize, size);
+    output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0);
+  }
+
+  void backward(const UpdateCallback& callback = nullptr) override {
+    getInputGrad(0)->add((real)1);
+  }
+};
+
+REGISTER_LAYER(sum_cost, SumCostLayer);
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/legacy/gserver/layers/CostLayer.h
similarity index 100%
rename from paddle/gserver/layers/CostLayer.h
rename to paddle/legacy/gserver/layers/CostLayer.h
diff --git a/paddle/gserver/layers/CropLayer.cpp b/paddle/legacy/gserver/layers/CropLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/CropLayer.cpp
rename to paddle/legacy/gserver/layers/CropLayer.cpp
diff --git a/paddle/gserver/layers/CropLayer.h b/paddle/legacy/gserver/layers/CropLayer.h
similarity index 100%
rename from paddle/gserver/layers/CropLayer.h
rename to paddle/legacy/gserver/layers/CropLayer.h
diff --git a/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp b/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0fe100a96c01713f6c8d10d4eff428e7e743b002
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp
@@ -0,0 +1,137 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
*/ + +#include "Layer.h" +#include "NormLayer.h" +#include "paddle/legacy/math/BaseMatrix.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data, + size_t iter, + size_t spatialDim) { + return Matrix::create(data->getData() + iter * channels_ * spatialDim, + channels_, + spatialDim, + false, + useGpu_); +} + +MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data, + size_t iter, + size_t spatialDim) { + return Matrix::create( + data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_); +} + +bool CrossChannelNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + CHECK(parameters_[0]); + const NormConfig& conf = config_.inputs(0).norm_conf(); + channels_ = conf.channels(); + scale_.reset(new Weight(channels_, 1, parameters_[0])); + return true; +} + +void CrossChannelNormLayer::forward(PassType passType) { + Layer::forward(passType); + MatrixPtr inV = getInputValue(0); + + size_t batchSize = inV->getHeight(); + size_t dataDim = inV->getWidth(); + CHECK_EQ(getSize(), dataDim); + + reserveOutput(batchSize, dataDim); + MatrixPtr outV = getOutputValue(); + size_t spatialDim = dataDim / channels_; + + Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_); + Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_); + Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_); + + inV->square2(*dataBuffer_); + for (size_t i = 0; i < batchSize; i++) { + const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); + const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim); + MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim); + MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim); + + // compute norm. + spatialBuffer_->sumCols(*dataTmp, 1, 0); + // add eps to avoid overflow + spatialBuffer_->add(1e-6); + spatialBuffer_->sqrt2(*spatialBuffer_); + normTmp->copyFrom(*spatialBuffer_); + outVTmp->copyFrom(*inVTmp); + outVTmp->divRowVector(*spatialBuffer_); + // scale the layer. 
+  }
+}
+
+void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr outV = getOutputValue();
+
+  size_t batchSize = inG->getHeight();
+  size_t dataDim = inG->getWidth();
+  size_t spatialDim = dataDim / channels_;
+
+  MatrixPtr inGBuffer;
+  Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
+
+  dataBuffer_->dotMul(*outG, *outV);
+  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
+  scaleDiff_->zeroMem();
+  for (size_t i = 0; i < batchSize; i++) {
+    MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
+    const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+
+    channelBuffer_->sumRows(*dataTmp, 1, 0);
+    channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
+    // store a / scale[i] in scaleDiff_ temporarily
+    scaleDiff_->add(*channelBuffer_, 1.);
+
+    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
+    spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
+    // scale the grad
+    inGBuffer->copyFrom(*inVTmp);
+    inGBuffer->mulRowVector(*spatialBuffer_);
+    // divide by square of norm
+    spatialBuffer_->dotMul(*normTmp, *normTmp);
+    inGBuffer->divRowVector(*spatialBuffer_);
+    // subtract
+    inGBuffer->add(*outGTmp, -1, 1);
+    // divide by norm
+    inGBuffer->divRowVector(*normTmp);
+    // scale the diff
+    inGBuffer->mulColVector(*scale_->getW());
+
+    inGTmp->add(*inGBuffer);
+  }
+  // update scale
+  if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
+  scale_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp
similarity index 100%
rename from paddle/gserver/layers/CrossEntropyOverBeam.cpp
rename to paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h
similarity index 100%
rename from paddle/gserver/layers/CrossEntropyOverBeam.h
rename to paddle/legacy/gserver/layers/CrossEntropyOverBeam.h
diff --git a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3f4e17c0188c7d68965f43148ce29a38dacbf809
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp
@@ -0,0 +1,180 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
*/ + +#include "CudnnBatchNormLayer.h" +#include "Layer.h" +#include "paddle/legacy/cuda/include/hl_batch_norm.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer); + +bool CudnnBatchNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false; + CHECK(useGpu_) << "CudnnBatchNorm only support GPU"; + + hl_create_tensor_descriptor(&ioDesc_); + hl_create_tensor_descriptor(&bnParamDesc_); + hl_tensor_reshape(bnParamDesc_, 1, channels_, 1, 1); + + return true; +} + +void CudnnBatchNormLayer::reshape(int batchSize) { + hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_); +} + +void CudnnBatchNormLayer::forward(PassType passType) { + Layer::forward(passType); + + int batchSize = getInputValue(0)->getHeight(); + calFeatureMapSize(); + reshape(batchSize); + resetOutput(batchSize, getInputValue(0)->getWidth()); + + // for testing in training peroid. + useGlobalStats_ = (passType == PASS_TEST); + if (passType == PASS_TEST && config_.has_use_global_stats()) { + useGlobalStats_ = config_.use_global_stats(); + } + + real* input = getInputValue(0)->getData(); + real* output = getOutputValue()->getData(); + real* gamma = weight_->getW()->getData(); + real* beta = biases_->getW()->getData(); + real* movingMean = movingMean_->getW()->getData(); + real* movingVar = movingVar_->getW()->getData(); + + // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON. + eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast(epsilon_)); + + if (!useGlobalStats_) { + REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str()); + real* savedMean = savedMean_->getData(); + real* savedInvVar = savedInvVar_->getData(); + hl_batch_norm_forward_training(ioDesc_, + input, + ioDesc_, + output, + bnParamDesc_, + gamma, + beta, + 1.0 - movingAvgFraction_, + movingMean, + movingVar, + eps_, + savedMean, + savedInvVar); + } else { + // used movingMean and movingVar in testing + if (batchSize <= 1024) { + hl_batch_norm_forward_inference(ioDesc_, + input, + ioDesc_, + output, + bnParamDesc_, + gamma, + beta, + movingMean, + movingVar, + eps_); + } else { + // There is a limitation in cudnn library. + // When the batch size is larger than 1024 in cuDNN v5.1, + // the cudnnBatchNormalizationForwardInference will fail. + hl_batch_norm_cuda_inference(input, + output, + gamma, + beta, + movingMean, + movingVar, + eps_, + batchSize, + channels_, + imageH_ * imageD_, + imageW_); + } + } + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + + real* input = getInputValue(0)->getData(); + real* outGrad = getOutputGrad()->getData(); + real* inGrad = getInputGrad(0)->getData(); + real* gamma = weight_->getW()->getData(); + real* savedMean = savedMean_->getData(); + real* savedInvVar = savedInvVar_->getData(); + + // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON. 
+
+  auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
+    Matrix::resizeOrCreate(m, h, w, false, true);
+    m->zeroMem();
+    *p = m->getData();
+  };
+
+  real* gammaGrad = nullptr;
+  real* betaGrad = nullptr;
+  if (weight_->getWGrad()) {
+    gammaGrad = weight_->getWGrad()->getData();
+  } else {
+    create(tmpWGrad_, 1, channels_, &gammaGrad);
+  }
+  if (biases_ && biases_->getWGrad()) {
+    betaGrad = biases_->getWGrad()->getData();
+  } else {
+    create(tmpBiasGrad_, 1, channels_, &betaGrad);
+  }
+
+  hl_batch_norm_backward(ioDesc_,
+                         input,
+                         ioDesc_,
+                         outGrad,
+                         ioDesc_,
+                         inGrad,
+                         bnParamDesc_,
+                         gamma,
+                         gammaGrad,
+                         betaGrad,
+                         eps_,
+                         savedMean,
+                         savedInvVar);
+
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    biases_->getParameterPtr()->incUpdate(callback);
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+CudnnBatchNormLayer::~CudnnBatchNormLayer() {
+  hl_destroy_tensor_descriptor(ioDesc_);
+  hl_destroy_tensor_descriptor(bnParamDesc_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h
similarity index 100%
rename from paddle/gserver/layers/CudnnBatchNormLayer.h
rename to paddle/legacy/gserver/layers/CudnnBatchNormLayer.h
diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/CudnnConvBaseLayer.cpp
rename to paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp
diff --git a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d050183eb7838bed803995985383e0ee4e9731a1
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "Projection.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A 2-dimensional conv layer implemented by cuDNN. It only
+ * supports GPU mode. CudnnConvLayer is selected automatically for GPU
+ * mode and ExpandConvLayer for CPU mode when the layer type is set to
+ * "conv". Users can also explicitly specify the type as "exconv" or
+ * "cudnn_conv".
+ *
+ * The config file api is img_conv_layer.
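+ *
+ * A hedged config sketch, assuming the legacy Python trainer-config API;
+ * the input layer `data` and all parameter values here are illustrative:
+ *
+ *   conv = img_conv_layer(input=data, filter_size=3, num_channels=3,
+ *                         num_filters=64, stride=1, padding=1)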
+ */
+class CudnnConvBaseLayer : public ConvBaseLayer {
+ protected:
+  std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
+  std::vector<std::unique_ptr<Projection>> projections_;
+
+  hl_tensor_descriptor biasDesc_;
+  hl_tensor_descriptor outputDesc_;
+
+ public:
+  explicit CudnnConvBaseLayer(const LayerConfig& config)
+      : ConvBaseLayer(config) {}
+
+  ~CudnnConvBaseLayer();
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp b/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9739ed9da463be7434731a9b035f3ee7cf3fc2bf
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp
@@ -0,0 +1,139 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CudnnPoolLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+bool CudnnPoolLayer::typeCheck(const std::string &poolType,
+                               hl_pooling_mode_t *mode) {
+  if (poolType == "cudnn-max-pool") {
+    if (mode) {
+      *mode = HL_POOLING_MAX;
+    }
+  } else if (poolType == "cudnn-avg-pool") {
+    if (mode) {
+      *mode = HL_POOLING_AVERAGE;
+    }
+  } else if (poolType == "cudnn-avg-incl-pad-pool") {
+    if (mode) {
+      *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
+    }
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+CudnnPoolLayer::CudnnPoolLayer(const LayerConfig &config) : PoolLayer(config) {
+  const std::string &pool_type = config.inputs(0).pool_conf().pool_type();
+  CHECK_EQ(CudnnPoolLayer::typeCheck(pool_type, &mode_), true);
+}
+
+bool CudnnPoolLayer::init(const LayerMap &layerMap,
+                          const ParameterMap &parameterMap) {
+  PoolLayer::init(layerMap, parameterMap);
+
+  CHECK(useGpu_) << "CudnnPoolLayer only supports GPU";
+
+  hl_create_tensor_descriptor(&inputDesc_);
+  hl_create_tensor_descriptor(&outputDesc_);
+
+  windowHeight = sizeY_;
+  windowWidth = sizeX_;
+  heightPadding = confPaddingY_;
+  widthPadding = confPadding_;
+  strideHeight = strideY_;
+  strideWidth = stride_;
+
+  hl_create_pooling_descriptor(&poolingDesc_,
+                               mode_,
+                               windowHeight,
+                               windowWidth,
+                               heightPadding,
+                               widthPadding,
+                               strideHeight,
+                               strideWidth);
+
+  return true;
+}
+
+void CudnnPoolLayer::reshape(int batchSize) {
+  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  if (imageH_ == 0) {
+    imageH_ = imgSizeY_;
+  }
+  if (imageW_ == 0) {
+    imageW_ = imgSize_;
+  }
+  CHECK_EQ(inputLayers_[0]->getOutput().value->getWidth(),
+           channels_ * imageH_ * imageW_);
+  outputH_ = outputSize(imageH_,
+                        sizeY_,
+                        confPaddingY_,
+                        strideY_,
+                        /* caffeMode */ false);
+  outputW_ =
+      outputSize(imageW_, sizeX_, confPadding_, stride_, /* caffeMode */ false);
+  getOutput().setFrameHeight(outputH_);
+  getOutput().setFrameWidth(outputW_);
+
+  hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_);
batchSize, channels_, imageH_, imageW_); + hl_tensor_reshape(outputDesc_, batchSize, channels_, outputH_, outputW_); +} + +void CudnnPoolLayer::forward(PassType passType) { + Layer::forward(passType); + + CHECK(inputLayers_[0]->getOutputValue()->useGpu()); + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + reshape(batchSize); + resetOutput(batchSize, outputH_ * outputW_ * channels_); + + real *inputData = getInputValue(0)->getData(); + real *outData = getOutputValue()->getData(); + hl_pooling_forward(inputDesc_, inputData, outputDesc_, outData, poolingDesc_); +} + +void CudnnPoolLayer::backward(const UpdateCallback &callback) { + (void)callback; + if (NULL == getInputGrad(0)) { + return; + } + + real *inputData = getInputValue(0)->getData(); + real *inputGrad = getInputGrad(0)->getData(); + real *outData = getOutputValue()->getData(); + real *outGrad = getOutputGrad()->getData(); + hl_pooling_backward(inputDesc_, + inputData, + inputGrad, + outputDesc_, + outData, + outGrad, + poolingDesc_); +} + +CudnnPoolLayer::~CudnnPoolLayer() { + hl_destroy_tensor_descriptor(inputDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_pooling_descriptor(poolingDesc_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/legacy/gserver/layers/CudnnPoolLayer.h similarity index 100% rename from paddle/gserver/layers/CudnnPoolLayer.h rename to paddle/legacy/gserver/layers/CudnnPoolLayer.h diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/legacy/gserver/layers/DataLayer.cpp similarity index 100% rename from paddle/gserver/layers/DataLayer.cpp rename to paddle/legacy/gserver/layers/DataLayer.cpp diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/legacy/gserver/layers/DataLayer.h similarity index 100% rename from paddle/gserver/layers/DataLayer.h rename to paddle/legacy/gserver/layers/DataLayer.h diff --git a/paddle/gserver/layers/DataNormLayer.cpp b/paddle/legacy/gserver/layers/DataNormLayer.cpp similarity index 100% rename from paddle/gserver/layers/DataNormLayer.cpp rename to paddle/legacy/gserver/layers/DataNormLayer.cpp diff --git a/paddle/legacy/gserver/layers/DataNormLayer.h b/paddle/legacy/gserver/layers/DataNormLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..556d7f4d669095cef0d6506fbba07b7455456b43 --- /dev/null +++ b/paddle/legacy/gserver/layers/DataNormLayer.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/ThreadLocal.h" + +namespace paddle { + +/** + * @brief A layer for data normalization + * - Input: One and only one input layer is accepted. The input layer must + * be DataLayer with dense data type. + * - Output: The normalization of the input data + * + * Reference: + * LA Shalabi, Z Shaaban, B Kasasbeh. 
Data mining: A preprocessing engine
+ *
+ * Three data normalization methods are considered:
+ * - z-score: y = (x-mean)/std
+ * - min-max: y = (x-min)/(max-min)
+ * - decimal-scaling: y = x/10^j, where j is the smallest integer such that
+ *   max(|y|) < 1
+ */
+
+class DataNormLayer : public Layer {
+ public:
+  enum NormalizationStrategy { kZScore = 0, kMinMax = 1, kDecimalScaling = 2 };
+
+  explicit DataNormLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~DataNormLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ protected:
+  int mode_;
+  std::unique_ptr<Weight> weight_;
+  MatrixPtr min_;
+  MatrixPtr rangeReciprocal_;    // 1/(max-min)
+  MatrixPtr mean_;
+  MatrixPtr stdReciprocal_;      // 1/std
+  MatrixPtr decimalReciprocal_;  // 1/10^j
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DeConv3DLayer.cpp b/paddle/legacy/gserver/layers/DeConv3DLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/DeConv3DLayer.cpp
rename to paddle/legacy/gserver/layers/DeConv3DLayer.cpp
diff --git a/paddle/legacy/gserver/layers/DeConv3DLayer.h b/paddle/legacy/gserver/layers/DeConv3DLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..9931bccb1284111e299206883847045edaae4ded
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DeConv3DLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of deconvolution3D layer.
+ * This layer expands the input and uses matrix multiplication to
+ * calculate the deconvolution3D operation.
+ */
+class DeConv3DLayer : public ConvBaseLayer {
+ public:
+  explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~DeConv3DLayer() {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  void bpropBiases();
+  void bpropData(int i);
+  void bpropWeights(int i);
+  size_t getSize();
+
+ protected:
+  // Figure out the dimensions for individual gemms.
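+  // (Editorial gloss, hedged -- not part of the original diff) Per input i,
+  // the layer performs GEMMs whose (M, N, K) shapes are cached in the
+  // vectors below and scatters the resulting column buffer back into the
+  // output volume; the /// comments give the exact dimension formulas.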
+  IntV M_;  /// numFilters_ / filter_group_;
+  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
+  IntV K_;  /// outputD_ * outputH_ * outputW_
+  IntV NOut_;
+  MatrixPtr colBuf_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/legacy/gserver/layers/DetectionOutputLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/DetectionOutputLayer.cpp
rename to paddle/legacy/gserver/layers/DetectionOutputLayer.cpp
diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/legacy/gserver/layers/DetectionOutputLayer.h
similarity index 100%
rename from paddle/gserver/layers/DetectionOutputLayer.h
rename to paddle/legacy/gserver/layers/DetectionOutputLayer.h
diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/legacy/gserver/layers/DetectionUtil.cpp
similarity index 100%
rename from paddle/gserver/layers/DetectionUtil.cpp
rename to paddle/legacy/gserver/layers/DetectionUtil.cpp
diff --git a/paddle/legacy/gserver/layers/DetectionUtil.h b/paddle/legacy/gserver/layers/DetectionUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1e0bb809ad290613159f558e9b1860476b3b5f2
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DetectionUtil.h
@@ -0,0 +1,307 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <utility>
+#include <vector>
+#include "paddle/legacy/math/Matrix.h"
+
+using std::vector;
+using std::pair;
+using std::map;
+
+namespace paddle {
+
+template <typename T>
+struct BBoxBase {
+  BBoxBase(T xMin, T yMin, T xMax, T yMax)
+      : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {}
+
+  BBoxBase() {}
+
+  T getWidth() const { return xMax - xMin; }
+
+  T getHeight() const { return yMax - yMin; }
+
+  T getCenterX() const { return (xMin + xMax) / 2; }
+
+  T getCenterY() const { return (yMin + yMax) / 2; }
+
+  T getArea() const { return getWidth() * getHeight(); }
+
+  // coordinate of bounding box
+  T xMin;
+  T yMin;
+  T xMax;
+  T yMax;
+  // whether difficult object (e.g. object with heavy occlusion is difficult)
+  bool isDifficult;
+};
+
+struct NormalizedBBox : BBoxBase<real> {
+  NormalizedBBox() : BBoxBase<real>() {}
+};
+
+enum PermMode { kNCHWToNHWC, kNHWCToNCHW };
+
+/**
+ * @brief First permute the input matrix, then append it to the output matrix
+ */
+size_t appendWithPermute(const Matrix& inMatrix,
+                         size_t height,
+                         size_t width,
+                         size_t outTotalSize,
+                         size_t outOffset,
+                         size_t batchSize,
+                         Matrix& outMatrix,
+                         PermMode permMode);
+
+/**
+ * @brief First permute the input matrix, then decompose it into the output
+ */
+size_t decomposeWithPermute(const Matrix& inMatrix,
+                            size_t height,
+                            size_t width,
+                            size_t totalSize,
+                            size_t offset,
+                            size_t batchSize,
+                            Matrix& outMatrix,
+                            PermMode permMode);
+
+/**
+ * @brief Compute jaccard overlap between two bboxes.
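+ *        (Editorial gloss, hedged) jaccard overlap is the IoU ratio:
+ *        area of intersection / area of union of the two bboxes, in [0, 1].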
+ * @param bbox1 The first bbox
+ * @param bbox2 The second bbox
+ */
+real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2);
+
+/**
+ * @brief Compute the offset parameters between the prior bbox and the
+ *        ground truth bbox; the variances of the prior bbox are considered
+ * @param priorBBox Input prior bbox
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param gtBBox Groundtruth bbox
+ * @param outVec Output vector
+ */
+void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                       const vector<real>& priorBBoxVar,
+                       const NormalizedBBox& gtBBox,
+                       vector<real>& outVec);
+
+/**
+ * @brief Decode the prior bbox with offset parameters;
+ *        the variances of the prior bbox are considered
+ * @param priorBBox Prior bbox to be decoded
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param locPredData Offset parameters
+ */
+NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                                 const vector<real>& priorBBoxVar,
+                                 const vector<real>& locPredData);
+
+/**
+ * @brief Extract bboxes from the prior matrix, the layout is
+ *        xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param numBBoxes Number of bboxes to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromPriorData(const real* priorData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Extract labels, scores and bboxes from the detection matrix, the
+ *        layout is imageId | label | score | xmin | ymin | xmax | ymax
+ * @param detectData Matrix of detection value
+ * @param numBBoxes Number of bboxes to be extracted
+ * @param labelVec Label of bbox
+ * @param scoreVec Score of bbox
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromDetectData(const real* detectData,
+                           const size_t numBBoxes,
+                           vector<real>& labelVec,
+                           vector<real>& scoreVec,
+                           vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Extract variances from the prior matrix, the layout is
+ *        xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param num Number to be extracted
+ * @param varVec Append to the vector
+ */
+void getBBoxVarFromPriorData(const real* priorData,
+                             const size_t num,
+                             vector<vector<real>>& varVec);
+
+/**
+ * @brief Extract bboxes from the label matrix, the layout is
+ *        class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ...
+ * @param labelData Matrix of label value
+ * @param numBBoxes Number to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromLabelData(const real* labelData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Match prior bboxes to groundtruth bboxes; the strategy is:
+ *        1. Find the most overlapped bbox pair (prior and groundtruth).
+ *        2. For the rest of the prior bboxes, find the most overlapped
+ *           groundtruth bbox.
+ * @param priorBBoxes prior bbox
+ * @param gtBBoxes groundtruth bbox
+ * @param overlapThreshold Lower bound of overlap (to judge whether matched)
+ * @param matchIndices For each prior bbox, the groundtruth bbox index if
+ *        matched, otherwise -1
+ * @param matchOverlaps For each prior bbox, the overlap with all groundtruth
+ *        bboxes
+ */
+void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
+               const vector<NormalizedBBox>& gtBBoxes,
+               real overlapThreshold,
+               vector<int>* matchIndices,
+               vector<real>* matchOverlaps);
+
+/**
+ * @brief Generate positive bboxes and negative bboxes,
+ *        |positive bboxes| / |negative bboxes| is negPosRatio
+ * @param priorValue Prior value
+ * @param numPriorBBoxes Number of prior bboxes
+ * @param gtValue Groundtruth value
+ * @param gtStartPosPtr Since the groundtruth value is stored as a sequence
+ *        type, this parameter indicates the start position of each record
+ * @param seqNum Number of sequences
+ * @param maxConfScore Classification score for each prior bbox, used to mine
+ *        negative examples
+ * @param batchSize Number of images
+ * @param overlapThreshold Lower bound of overlap
+ * @param negOverlapThreshold Upper bound of overlap (to judge negative
+ *        examples)
+ * @param negPosRatio Controls the number of negative bboxes
+ * @param matchIndicesVecPtr Saves indices of matched prior bboxes
+ * @param negIndicesVecPtr Saves indices of negative prior bboxes
+ */
+pair<size_t, size_t> generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const vector<vector<real>>& maxConfScore,
+    const size_t batchSize,
+    const real overlapThreshold,
+    const real negOverlapThreshold,
+    const size_t negPosRatio,
+    vector<vector<int>>* matchIndicesVecPtr,
+    vector<vector<int>>* negIndicesVecPtr);
+
+/**
+ * @brief Get the max confidence score for each prior bbox
+ * @param confData Confidence scores, layout is
+ *        class1 score | class2 score | ... | classN score ...
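+ *        (Editorial gloss, hedged) confData holds batchSize * numPriorBBoxes
+ *        rows of numClasses scores each; for every prior bbox the maximum
+ *        score over the non-background classes is recorded, which is later
+ *        used to mine hard negative examples.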
+ * @param batchSize Number of images
+ * @param numPriorBBoxes Number of prior bboxes
+ * @param numClasses Number of classes
+ * @param backgroundId Background id
+ * @param maxConfScoreVecPtr Output
+ */
+void getMaxConfidenceScores(const real* confData,
+                            const size_t batchSize,
+                            const size_t numPriorBBoxes,
+                            const size_t numClasses,
+                            const size_t backgroundId,
+                            vector<vector<real>>* maxConfScoreVecPtr);
+
+template <typename T>
+bool sortScorePairDescend(const pair<real, T>& pair1,
+                          const pair<real, T>& pair2);
+
+template <>
+bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
+                          const pair<real, NormalizedBBox>& pair2);
+
+/**
+ * @brief Do NMS for bboxes to remove duplicated bboxes
+ * @param bboxes BBoxes to apply NMS
+ * @param confScoreData Confidence scores
+ * @param classIdx Class to do NMS
+ * @param topK Number to keep
+ * @param confThreshold Lower bound of confidence score
+ * @param nmsThreshold Threshold of overlap
+ * @param numPriorBBoxes Total number of prior bboxes
+ * @param numClasses Total number of classes
+ * @param indices Indices of high quality bboxes
+ */
+void applyNMSFast(const vector<NormalizedBBox>& bboxes,
+                  const real* confScoreData,
+                  size_t classIdx,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  size_t numPriorBBoxes,
+                  size_t numClasses,
+                  vector<size_t>* indices);
+
+/**
+ * @brief Get the detection results which satisfy the requirements
+ * @param numPriorBBoxes Number of prior bboxes
+ * @param numClasses Number of classes
+ * @param backgroundId Background class
+ * @param batchSize Number of images
+ * @param confThreshold Threshold of class confidence
+ * @param nmsTopK Used in the NMS operation to keep the top k bboxes
+ * @param nmsThreshold Used in NMS, threshold of overlap
+ * @param keepTopK How many bboxes are kept per image
+ * @param allDecodedBBoxes Decoded bboxes for all images
+ * @param allDetectionIndices Saves detection bbox indices
+ */
+size_t getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const real confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices);
+
+/**
+ * @brief Get the detection results
+ * @param confData Confidence scores
+ * @param numPriorBBoxes Number of prior bboxes
+ * @param numClasses Number of classes
+ * @param batchSize Number of images
+ * @param allIndices Indices of predicted bboxes
+ * @param allDecodedBBoxes Decoded bboxes
+ * @param out Output matrix, layout is
+ *        image number | label | confidence score | xMin | yMin | xMax | yMax
+ */
+void getDetectionOutput(const real* confData,
+                        const size_t numKept,
+                        const size_t numPriorBBoxes,
+                        const size_t numClasses,
+                        const size_t batchSize,
+                        const vector<map<size_t, vector<size_t>>>& allIndices,
+                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+                        Matrix& out);
+
+NormalizedBBox clipBBox(const NormalizedBBox& bbox);
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DotMulOperator.cpp b/paddle/legacy/gserver/layers/DotMulOperator.cpp
similarity index 100%
rename from paddle/gserver/layers/DotMulOperator.cpp
rename to paddle/legacy/gserver/layers/DotMulOperator.cpp
diff --git a/paddle/gserver/layers/DotMulProjection.cpp b/paddle/legacy/gserver/layers/DotMulProjection.cpp
similarity index 100%
rename from paddle/gserver/layers/DotMulProjection.cpp
rename to paddle/legacy/gserver/layers/DotMulProjection.cpp
diff --git a/paddle/legacy/gserver/layers/DotProdLayer.cpp b/paddle/legacy/gserver/layers/DotProdLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..445361b10176a160609b181cb2fdc3756921c423
---
/dev/null +++ b/paddle/legacy/gserver/layers/DotProdLayer.cpp @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for computing the dot product of two vectors. + * Input1: vector (batchSize * dim) + * Input2: vector (batchSize * dim) + * Output: a matrix: (batchSize * 1) + */ + +class DotProdLayer : public Layer { + public: + explicit DotProdLayer(const LayerConfig& config) : Layer(config) {} + + ~DotProdLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(dot_prod, DotProdLayer); + +bool DotProdLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + CHECK_EQ(1UL, getSize()) + << "The output dimensionality of this layer should be fixed to 1."; + + return true; +} + +void DotProdLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV0->getHeight(); + CHECK_EQ(inV1->getHeight(), batchSize); + CHECK_EQ(inV0->getWidth(), inV1->getWidth()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, 1); + } + + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str()); + outV->sumOfProducts(*inV0, *inV1, 1, 0); + } +} + +void DotProdLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr outG = getOutputGrad(); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + + { + REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str()); + + if (inG0) { + inG0->addRowScale(0, *inV1, *outG); + } + + if (inG1) { + inG1->addRowScale(0, *inV0, *outG); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp similarity index 100% rename from paddle/gserver/layers/EosIdCheckLayer.cpp rename to paddle/legacy/gserver/layers/EosIdCheckLayer.cpp diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/legacy/gserver/layers/ExpandConvLayer.cpp similarity index 100% rename from paddle/gserver/layers/ExpandConvLayer.cpp rename to paddle/legacy/gserver/layers/ExpandConvLayer.cpp diff --git a/paddle/legacy/gserver/layers/ExpandConvLayer.h b/paddle/legacy/gserver/layers/ExpandConvLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..c0eff3ab061949bd583e0deaf121912ed993be76 --- /dev/null +++ b/paddle/legacy/gserver/layers/ExpandConvLayer.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of convolution layer.
+ * This layer expands the input and uses matrix multiplication to
+ * calculate the convolution operation.
+ *
+ * The config file api is img_conv_layer.
+ */
+
+class ExpandConvLayer : public ConvBaseLayer {
+ public:
+  explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+
+  ~ExpandConvLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+  size_t getOutputSize();
+
+ protected:
+  std::vector<TensorShape> inputShape_;
+  std::vector<TensorShape> filterShape_;
+  std::vector<TensorShape> outputShape_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandLayer.cpp b/paddle/legacy/gserver/layers/ExpandLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/ExpandLayer.cpp
rename to paddle/legacy/gserver/layers/ExpandLayer.cpp
diff --git a/paddle/legacy/gserver/layers/ExpandLayer.h b/paddle/legacy/gserver/layers/ExpandLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..75a1ec75688cdbc61a117da7d4be47848c30425a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ExpandLayer.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * A layer that expands dense data, or sequence data whose sequence lengths
+ * are all one, into sequence data.
+ *
+ * It takes exactly two inputs, one for data and one for size:
+ * - the first one for data
+ *   - If ExpandLevel = kNonSeq: dense data
+ *   - If ExpandLevel = kSeq: sequence data where the length of each
+ *     sequence is one
+ * - the second one only for sequence info
+ *   - should be sequence data with or without sub-sequence.
+ *
+ * The output batch size is the number of sequences (not instances) of the
+ * second input.
+ *
+ * The config file api is expand_layer.
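+ *
+ * A small illustration (editorial, hedged):
+ * @code
+ * // input[0] (kNonSeq, dense): rows [a; b]
+ * // input[1] (sequence info):  two sequences with lengths {3, 2}
+ * // output (sequence data):    [a, a, a, b, b]
+ * @endcode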
+ */ + +class ExpandLayer : public Layer { + protected: + std::unique_ptr biases_; + /// if input[0] is dense data, ExpandLevel=kNonSeq; + /// if input[0] is sequence data, ExpandLevel=kSeq + enum ExpandLevel { kNonSeq = 0, kSeq = 1 }; + /// store the ExpandLevel + int type_; + /// expanded sequenceStartPositions or subSequenceStartPositions + /// of input[1] + ICpuGpuVectorPtr expandStartsPos_; + + public: + explicit ExpandLayer(const LayerConfig& config) : Layer(config) {} + + ~ExpandLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp b/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ddd202e1c6d20e25ec77dc881965a47092b10e42 --- /dev/null +++ b/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "FactorizationMachineLayer.h" +#include +#include +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(factorization_machine, FactorizationMachineLayer); + +bool FactorizationMachineLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + factorSize_ = config_.factor_size(); + + /* initialize the latentVectors_ */ + CHECK_EQ(inputLayers_.size(), 1UL); + size_t inputSize = inputLayers_[0]->getSize(); + CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_); + latentVectors_ = std::unique_ptr( + new Weight(inputSize, factorSize_, parameters_[0])); + + return true; +} + +void FactorizationMachineLayer::forward(PassType passType) { + Layer::forward(passType); + + const MatrixPtr& inputV = getInputValue(0); + + size_t batchSize = inputV->getHeight(); + size_t outputSize = getSize(); + size_t inputSize = inputLayers_[0]->getSize(); + reserveOutput(batchSize, outputSize); + + MatrixPtr outV = getOutputValue(); + + Matrix::resizeOrCreate( + latentVectorsSquare_, inputSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate( + inputMulFactor_, batchSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); + + REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str()); + inputMulFactor_->mul(*inputV, *latentVectors_->getW()); + inputMulFactor_->square2(*tmpOut_); + outV->sumRows(*tmpOut_, 0.5, 0); + + if (dynamic_cast(inputV.get())) { + Matrix::resizeOrCreateSparseMatrix(inputSquare_, + inputV->getHeight(), + inputV->getWidth(), + inputV->getElementCnt(), + inputV->getValueType()); + inputSquare_->copyFrom(*inputV); + (dynamic_cast(inputSquare_.get()))->square2(); + } else { + 
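+    // (Editorial gloss, hedged) Dense path: square the input element-wise so
+    // that, together with the squared latent vectors below, the layer can
+    // subtract 0.5 * rowSum((x .^ 2) * (V .^ 2)) from the already accumulated
+    // 0.5 * rowSum((x * V) .^ 2), completing the standard FM identity
+    //   sum_{i<j} <v_i, v_j> x_i x_j
+    //     = 0.5 * sum_f [ (sum_i x_i v_{if})^2 - sum_i x_i^2 v_{if}^2 ].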
Matrix::resizeOrCreate( + inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); + inputV->square2(*inputSquare_); + } + latentVectors_->getW()->square2(*latentVectorsSquare_); + tmpOut_->mul(*inputSquare_, *latentVectorsSquare_); + outV->sumRows(*tmpOut_, -0.5, 1.0); + + /* activation */ { + REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void FactorizationMachineLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { backwardActivation(); } + + const MatrixPtr& inputV = getInputValue(0); + const MatrixPtr& oGrad = getOutputGrad(); + + Matrix::resizeOrCreate( + tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_); + MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0), + latentVectors_->getW()->getHeight(), + 1, + false, + useGpu_); + + /* Calculate the gradients of the latentVectors_ matrix */ + if (latentVectors_->getWGrad()) { + if (dynamic_cast(inputV.get())) { + Matrix::resizeOrCreateSparseMatrix(tmpInput_, + inputV->getHeight(), + inputV->getWidth(), + inputV->getElementCnt()); + + CpuSparseMatrix* sparseInputV = + dynamic_cast(inputV.get()); + CpuSparseMatrix* sparseInputSquare = + dynamic_cast(inputSquare_.get()); + CpuSparseMatrix* sparseTmpInput = + dynamic_cast(tmpInput_.get()); + sparseTmpInput->copyFrom(*sparseInputV); + + sparseTmpInput->rowScale(0, *sparseInputV, *oGrad); + latentVectors_->getWGrad()->mul( + *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1); + sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad); + + Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_); + negOnes_->zeroMem(); + negOnes_->add(-1); + tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0); + } else { + Matrix::resizeOrCreate( + tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); + + tmpInput_->rowScale(0, *inputV, *oGrad); + latentVectors_->getWGrad()->mul( + *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1); + tmpInput_->rowScale(0, *inputSquare_, *oGrad); + + tmpSum_->sumCols(*tmpInput_, -1, 0); + } + + latentVectors_->getWGrad()->addRowScale( + 0, *latentVectors_->getW(), *tmpSumTrans); + + /* Increasing the number of gradient */ + latentVectors_->getParameterPtr()->incUpdate(callback); + } + + /* Calculate the input layers gradient */ + MatrixPtr inGrad = getInputGrad(0); + if (inGrad != NULL) { + inGrad->mul( + *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1); + tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0); + inGrad->addColScale(0, *inputV, *tmpSum_); + inGrad->rowScale(0, *inGrad, *oGrad); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/FactorizationMachineLayer.h b/paddle/legacy/gserver/layers/FactorizationMachineLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..1070ebd0971e18d090cd7c46cf38a016522db5b8 --- /dev/null +++ b/paddle/legacy/gserver/layers/FactorizationMachineLayer.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as the inner product of the learned latent vectors
+ * corresponding to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model higher
+ * order feature interactions, in practice usually only order-2 feature
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interactions with the formula:
+ *
+ * \f[
+ * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The detailed calculation for forward and backward can be found in this
+ * paper:
+ *
+ * Factorization machines.
+ *
+ * The config file api is factorization_machine.
+ */
+
+class FactorizationMachineLayer : public Layer {
+ protected:
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
+  std::unique_ptr<Weight> latentVectors_;
+  // The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+
+ private:
+  // Store the square values of the latent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of the input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both the forward and backward steps
+  MatrixPtr inputMulFactor_;
+  // Store temporary calculation results
+  MatrixPtr tmpOut_;
+  MatrixPtr tmpSum_;
+  MatrixPtr tmpInput_;
+  // A row vector whose entries are all -1 (not an identity matrix)
+  MatrixPtr negOnes_;
+
+ public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..417756a286d9538d9bd17a41b744f3f9ac820ae3
--- /dev/null
+++ b/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for expanding a batch of images to feature maps.
+ * Each sample of the input is a two-dimensional matrix. Each element of the
+ * matrix is replicated num_filters times to create a feature map with
+ * num_filters channels.
+ * - Input: Input one should be dense image data.
+ * - Output: expanded feature maps.
+ * \f[
+ * y.row[i] = x.row[i \mod x.width], i = 0,1,..., (x.width * num\_filters - 1)
+ * \f]
+ * For example, num_filters = 4:
+ * @code
+ * x = [a1,a2;
+ *      b1,b2]
+ * y = [a1, a2, a1, a2, a1, a2, a1, a2;
+ *      b1, b2, b1, b2, b1, b2, b1, b2;]
+ * @endcode
+ */
+
+class FeatureMapExpandLayer : public Layer {
+ private:
+  int numFilters_;
+  bool asRowVector_;
+
+ public:
+  explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~FeatureMapExpandLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer);
+
+bool FeatureMapExpandLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  numFilters_ = config_.num_filters();
+  asRowVector_ = config_.user_arg() != "as_col_vec";
+  return true;
+}
+
+void FeatureMapExpandLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr inputV = getInputValue(0);
+  size_t batchSize = getInput(0).getBatchSize();
+  int imgSize = inputV->getWidth();
+  resetOutput(batchSize, imgSize * numFilters_);
+
+  MatrixPtr outputV = getOutputValue();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    if (asRowVector_) {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           numFilters_,
+                           imgSize,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
+        outVTmp->addRowVector(*inVTmp);
+      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        outVTmp->addColVector(*inVTmp);
+      }
+    }
+  }
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inGrad = getInputGrad(0);
+  if (NULL == inGrad) {
+    return;
+  }
+  MatrixPtr outGrad = getOutputGrad();
+  size_t batchSize = getInput(0).getBatchSize();
+  int imgSize = inGrad->getWidth();
+  /* Do activation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    if (asRowVector_) {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           numFilters_,
+                           imgSize,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
+        inGradTmp->collectBias(*outGradTmp, 1);
+      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        inGradTmp->sumRows(*outGradTmp, 1, 1);
+      }
+    }
+  }
+}
+
+}  // namespace paddle
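// Editorial aside (not part of the diff): a minimal CPU sketch of the
// featmap_expand semantics documented above, under the assumption of
// row-major float buffers and asRowVector == true. The names here are
// hypothetical, not Paddle APIs.
#include <cstddef>
#include <vector>

// Replicates each input row `numFilters` times, matching the @code example
// above (x = [a1,a2; b1,b2], numFilters = 4).
std::vector<float> featmapExpandRows(const std::vector<float>& in,
                                     std::size_t batchSize,
                                     std::size_t imgSize,
                                     std::size_t numFilters) {
  std::vector<float> out(batchSize * numFilters * imgSize);
  for (std::size_t i = 0; i < batchSize; ++i) {
    for (std::size_t f = 0; f < numFilters; ++f) {
      for (std::size_t k = 0; k < imgSize; ++k) {
        out[(i * numFilters + f) * imgSize + k] = in[i * imgSize + k];
      }
    }
  }
  return out;
}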
diff --git a/paddle/gserver/layers/FullMatrixProjection.cpp b/paddle/legacy/gserver/layers/FullMatrixProjection.cpp
similarity index 100%
rename from paddle/gserver/layers/FullMatrixProjection.cpp
rename to paddle/legacy/gserver/layers/FullMatrixProjection.cpp
diff --git a/paddle/gserver/layers/FullMatrixProjection.h b/paddle/legacy/gserver/layers/FullMatrixProjection.h
similarity index 100%
rename from paddle/gserver/layers/FullMatrixProjection.h
rename to paddle/legacy/gserver/layers/FullMatrixProjection.h
diff --git a/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp b/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ffb4876f8b0a45abf5588201482d75cd9222fbd
--- /dev/null
+++ b/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FullyConnectedLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(fc, FullyConnectedLayer);
+
+bool FullyConnectedLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  /* initialize the weightList */
+  CHECK(inputLayers_.size() == parameters_.size());
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    // Determine the expected parameter dimensions
+    size_t height = inputLayers_[i]->getSize();
+    size_t width = getSize();
+
+    // create a new weight
+    if (parameters_[i]->isSparse()) {
+      CHECK_LE(parameters_[i]->getSize(), width * height);
+    } else {
+      CHECK_EQ(parameters_[i]->getSize(), width * height);
+    }
+    Weight* w = new Weight(height, width, parameters_[i]);
+
+    // append the new weight to the list
+    weights_.emplace_back(w);
+  }
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ =
+        std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+
+  return true;
+}
+
+void FullyConnectedLayer::prefetch() {
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    auto* sparseParam =
+        dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
+    if (sparseParam) {
+      MatrixPtr input = getInputValue(i);
+      sparseParam->addRows(input);
+    }
+  }
+}
+
+void FullyConnectedLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInput(0).getBatchSize();
+  int size = getSize();
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, size);
+  }
+
+  MatrixPtr outV = getOutputValue();
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    auto input = getInput(i);
+    CHECK(input.value) << "The input of the 'fc' layer must be a matrix";
+    REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
+    i == 0 ? outV->mul(*input.value, *weights_[i]->getW(), 1, 0)
+           : outV->mul(*input.value, *weights_[i]->getW(), 1, 1);
+  }
+
+  /* add the bias-vector */
+  if (biases_.get() != NULL) {
+    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void FullyConnectedLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  if (biases_ && biases_->getWGrad()) {
+    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    /* Increasing the number of gradient */
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  bool syncFlag = hl_get_sync_flag();
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    /* Calculate the W-gradient for the current layer */
+    if (weights_[i]->getWGrad()) {
+      MatrixPtr input_T = getInputValue(i)->getTranspose();
+      MatrixPtr oGrad = getOutputGrad();
+      {
+        REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
+        weights_[i]->getWGrad()->mul(*input_T, *oGrad, 1, 1);
+      }
+    }
+
+    // If callback does not change value, backprop error asynchronously so that
+    // we can do the callback concurrently.
+    hl_set_sync_flag(false);
+
+    /* Calculate the input layers error */
+    MatrixPtr preGrad = getInputGrad(i);
+    if (NULL != preGrad) {
+      MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
+      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
+      preGrad->mul(*getOutputGrad(), *weights_T, 1, 1);
+    }
+
+    hl_set_sync_flag(syncFlag);
+    {
+      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+      weights_[i]->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FullyConnectedLayer.h b/paddle/legacy/gserver/layers/FullyConnectedLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8a1c54e55fd4710c62ee8b91720755e0af80ff5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/FullyConnectedLayer.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * A layer that has full connections to all neurons in the previous layer.
+ * It computes an inner product with a set of learned weights, and
+ * (optionally) adds biases.
+ *
+ * The config file api is fc_layer.
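+ *
+ * (Editorial gloss, hedged) in formula form:
+ * \f[
+ * y = f(\sum_i x_i W_i + b)
+ * \f]
+ * where \f$x_i\f$ is the value of the i-th input, \f$W_i\f$ its weight
+ * matrix, \f$b\f$ the bias, and \f$f\f$ the configured activation.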
+ */
+
+class FullyConnectedLayer : public Layer {
+ protected:
+  WeightList weights_;
+  std::unique_ptr<Weight> biases_;
+
+ public:
+  explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {}
+  ~FullyConnectedLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  Weight& getWeight(int idx) { return *weights_[idx]; }
+
+  void prefetch() override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/GatedRecurrentLayer.cpp
rename to paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp
diff --git a/paddle/legacy/gserver/layers/GatedRecurrentLayer.h b/paddle/legacy/gserver/layers/GatedRecurrentLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bbf01ce200c9922f49508b0499aa9422745f474
--- /dev/null
+++ b/paddle/legacy/gserver/layers/GatedRecurrentLayer.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "GruCompute.h"
+#include "Layer.h"
+#include "SequenceToBatch.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief Please refer to "Junyoung Chung, Empirical Evaluation
+ * of Gated Recurrent Neural Networks on Sequence Modeling".
+ *
+ * GatedRecurrentLayer takes one input layer whose size is 3 times the layer
+ * size. The input layer is divided into 3 equal parts: (xz_t, xr_t, xi_t).
+ * parameter and biasParameter are also divided into 3 equal parts:
+ * - parameter consists of (U_z, U_r, U)
+ * - biasParameter consists of (bias_z, bias_r, bias_o)
+ *
+ * \f[
+ * update \ gate: z_t = actGate(xz_t + U_z * h_{t-1} + bias_z) \\
+ * reset \ gate: r_t = actGate(xr_t + U_r * h_{t-1} + bias_r) \\
+ * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, h_{t-1}) + bias_o) \\
+ * hidden \ activation: h_t = dot((1-z_t), h_{t-1}) + dot(z_t, {h}_t) \\
+ * \f]
+ *
+ * @note
+ * - dot denotes "element-wise multiplication".
+ * - actNode is defined by the config active_type
+ * - actGate is defined by the config active_gate_type
+ *
+ * The config file api is grumemory.
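+ *
+ * (Editorial gloss, hedged) because the input already carries the projected
+ * slices (xz_t, xr_t, xi_t), each step only multiplies h_{t-1} with U_z, U_r
+ * and U; the recurrent parameter therefore holds the three size x size
+ * matrices (U_z, U_r, U) concatenated.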
+ */ + +class GatedRecurrentLayer : public Layer, public GruCompute { + public: + explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + void resetState() override; + + void setState(LayerStatePtr state) override; + + LayerStatePtr getState() override; + + protected: + void forwardSequence(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputValue); + void backwardSequence(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputGrad); + + void forwardBatch(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputValue); + void backwardBatch(int batchSize, MatrixPtr inputGrad); + + protected: + std::unique_ptr weight_; + std::unique_ptr gateWeight_; + std::unique_ptr stateWeight_; + std::unique_ptr bias_; + + Argument gate_; + Argument resetOutput_; + + bool reversed_; + bool useBatch_; + std::unique_ptr batchValue_; + std::unique_ptr batchGrad_; + std::unique_ptr activationGate_; + + MatrixPtr prevOutput_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/legacy/gserver/layers/GetOutputLayer.cpp similarity index 100% rename from paddle/gserver/layers/GetOutputLayer.cpp rename to paddle/legacy/gserver/layers/GetOutputLayer.cpp diff --git a/paddle/legacy/gserver/layers/GruCompute.cpp b/paddle/legacy/gserver/layers/GruCompute.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d50c959e4386ece16b935835ee7d6d717b844e64 --- /dev/null +++ b/paddle/legacy/gserver/layers/GruCompute.cpp @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "GruCompute.h" +#include "hl_recurrent_apply.cuh" +#include "paddle/legacy/function/GruFunctor.h" +#include "paddle/utils/Util.h" + +namespace paddle { + +void GruCompute::init(LayerConfig &config) { + activeNode_ = hlActiveType(config.active_type()); + activeGate_ = hlActiveType(config.active_gate_type()); +} + +template <> +void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) { + GruFunctor::compute(hppl::forward::gru_resetOutput(), + hppl::forward::gru_finalOutput(), + value, + frameSize, + batchSize, + activeNode_, + activeGate_); +} + +template <> +void GruCompute::backward<0>(hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize) { + GruGradFunctor::compute( + hppl::backward::gru_stateGrad(), + hppl::backward::gru_resetGrad(), + value, + grad, + frameSize, + batchSize, + activeNode_, + activeGate_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/GruCompute.cu b/paddle/legacy/gserver/layers/GruCompute.cu similarity index 100% rename from paddle/gserver/layers/GruCompute.cu rename to paddle/legacy/gserver/layers/GruCompute.cu diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/legacy/gserver/layers/GruCompute.h similarity index 100% rename from paddle/gserver/layers/GruCompute.h rename to paddle/legacy/gserver/layers/GruCompute.h diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/legacy/gserver/layers/GruStepLayer.cpp similarity index 100% rename from paddle/gserver/layers/GruStepLayer.cpp rename to paddle/legacy/gserver/layers/GruStepLayer.cpp diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp similarity index 100% rename from paddle/gserver/layers/HierarchicalSigmoidLayer.cpp rename to paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h similarity index 100% rename from paddle/gserver/layers/HierarchicalSigmoidLayer.h rename to paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h diff --git a/paddle/gserver/layers/IdentityProjection.cpp b/paddle/legacy/gserver/layers/IdentityProjection.cpp similarity index 100% rename from paddle/gserver/layers/IdentityProjection.cpp rename to paddle/legacy/gserver/layers/IdentityProjection.cpp diff --git a/paddle/legacy/gserver/layers/InterpolationLayer.cpp b/paddle/legacy/gserver/layers/InterpolationLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..aabfdc55ba4dc44bf7f487cb30be32a376175729 --- /dev/null +++ b/paddle/legacy/gserver/layers/InterpolationLayer.cpp @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * A layer for linear interpolation with two inputs, + * which is used in NEURAL TURING MACHINE. 
+ * \f[ + * y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i] + * \f] + * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs, + * \f$w\f$ is (batchSize x 1) weight vector, + * and \f$y\f$ is (batchSize x dataDim) output. + * + * The config file api is interpolation_layer. + */ + +class InterpolationLayer : public Layer { + protected: + /// weightLast = 1 - weight + MatrixPtr weightLast_; + MatrixPtr tmpMatrix; + + public: + explicit InterpolationLayer(const LayerConfig& config) : Layer(config) {} + + ~InterpolationLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(interpolation, InterpolationLayer); + +bool InterpolationLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(3U, inputLayers_.size()); + + return true; +} + +void InterpolationLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr weightV = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr inV2 = getInputValue(2); + + size_t batchSize = inV1->getHeight(); + size_t dataDim = inV1->getWidth(); + + CHECK_EQ(dataDim, getSize()); + CHECK_EQ(dataDim, inV2->getWidth()); + CHECK_EQ(batchSize, inV1->getHeight()); + CHECK_EQ(batchSize, inV2->getHeight()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + resetOutput(batchSize, dataDim); + } + + MatrixPtr outV = getOutputValue(); + + Matrix::resizeOrCreate(weightLast_, batchSize, 1, false, useGpu_); + weightLast_->one(); + weightLast_->sub(*weightV); + + REGISTER_TIMER_INFO("FwInterpTimer", getName().c_str()); + // outV = inV1 * weight + inV2 * weightLast + outV->addRowScale(0, *inV1, *weightV); + outV->addRowScale(0, *inV2, *weightLast_); +} + +void InterpolationLayer::backward(const UpdateCallback& callback) { + MatrixPtr outG = getOutputGrad(); + MatrixPtr weightV = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr inV2 = getInputValue(2); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + MatrixPtr inG2 = getInputGrad(2); + + size_t batchSize = inV1->getHeight(); + size_t dataDim = inV1->getWidth(); + + REGISTER_TIMER_INFO("BwInterpTimer", getName().c_str()); + + if (inG0) { + Matrix::resizeOrCreate(tmpMatrix, batchSize, dataDim, false, useGpu_); + + // inG0 += outG .* (inV1 - inV2) + tmpMatrix->sub(*inV1, *inV2); + inG0->rowDotMul(0, *outG, *tmpMatrix); + } + + if (inG1) { + // inG1 += outG * weight + inG1->addRowScale(0, *outG, *weightV); + } + + if (inG2) { + // inG2 += outG * weightLast + inG2->addRowScale(0, *outG, *weightLast_); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp similarity index 100% rename from paddle/gserver/layers/KmaxSeqScoreLayer.cpp rename to paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp diff --git a/paddle/gserver/layers/L2DistanceLayer.cpp b/paddle/legacy/gserver/layers/L2DistanceLayer.cpp similarity index 100% rename from paddle/gserver/layers/L2DistanceLayer.cpp rename to paddle/legacy/gserver/layers/L2DistanceLayer.cpp diff --git a/paddle/legacy/gserver/layers/L2DistanceLayer.h b/paddle/legacy/gserver/layers/L2DistanceLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..aa8aabd9ca5702e3ebdccbe7bb4f98fa087dd238 --- /dev/null 
+++ b/paddle/legacy/gserver/layers/L2DistanceLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief The layer calculates the l2 distance between two input vectors.
+ * \f[
+ * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)^2}
+ * \f]
+ *
+ * - Input1: A vector (batchSize * dataDim)
+ * - Input2: A vector (batchSize * dataDim)
+ * - Output: A vector (batchSize * 1)
+ *
+ * The configuration api is: l2_distance_layer.
+ */
+
+class L2DistanceLayer : public Layer {
+ public:
+  explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
+  ~L2DistanceLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ private:
+  // Store the result of subtracting Input2 from Input1 in forward computation,
+  // which will be reused in backward computation.
+  MatrixPtr inputSub_;
+};
+
+} // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Layer.cpp b/paddle/legacy/gserver/layers/Layer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f580b8e6977d5c757c118e7945d6dc8d88b3c927
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Layer.cpp
@@ -0,0 +1,410 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/utils/Util.h"
+
+#include "CostLayer.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/utils/Error.h"
+#include "paddle/utils/Logging.h"
+
+#ifndef PADDLE_MOBILE_INFERENCE
+#include "ValidationLayer.h"
+#endif
+
+DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
+
+namespace paddle {
+
+Layer::Layer(const LayerConfig& config, bool useGpu)
+    : config_(config),
+      useGpu_(useGpu),
+      deviceId_(CPU_DEVICE),
+      needSequenceInfo_(true) {}
+
+bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  if (useGpu_ && FLAGS_parallel_nn) {
+    /* gpu environment is specified by device property */
+    deviceId_ = config_.device();
+    if (deviceId_ < 0) {
+      useGpu_ = false;
+    }
+  }
+
+  output_.deviceId = deviceId_;
+
+  for (auto& inputConfig : config_.inputs()) {
+    std::string inputName = inputConfig.input_layer_name();
+    LayerPtr inputLayer;
+    CHECK(mapGet(inputName, layerMap, &inputLayer))
+        << "Cannot find input layer " << inputName << " for layer "
+        << getName();
+    this->addPrev(inputLayer);
+
+    inputLayer->addOutputArgument(deviceId_);
+
+    if (inputConfig.has_input_parameter_name()) {
+      ParameterPtr parameter;
+      CHECK(
+          mapGet(inputConfig.input_parameter_name(), parameterMap, &parameter))
+          << "Cannot find input parameter "
+          << inputConfig.input_parameter_name() << " for layer " << getName();
+      parameter->incShared();
+      CHECK_EQ(parameter->getDeviceId(), getDeviceId());
+      parameters_.push_back(parameter);
+    } else {
+      parameters_.push_back(nullptr);
+    }
+
+    if (inputConfig.has_input_layer_argument()) {
+      inputArgument_.push_back(inputConfig.input_layer_argument());
+    } else {
+      inputArgument_.push_back("");
+    }
+  }
+
+  if (config_.has_bias_parameter_name()) {
+    CHECK(mapGet(config_.bias_parameter_name(), parameterMap, &biasParameter_))
+        << "Cannot find bias parameter " << config_.bias_parameter_name()
+        << " for layer " << getName();
+    biasParameter_->incShared();
+    CHECK_EQ(biasParameter_->getDeviceId(), getDeviceId());
+  }
+
+  /* specify the activation function according to the configuration */
+  std::string action_type = config_.active_type();
+  activation_.reset(ActivationFunction::create(action_type));
+  CHECK(activation_);
+
+  initNeedFlags();
+  markInBackward_.assign(inputLayers_.size(), false);
+
+  return true;
+}
+
+ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
+
+LayerPtr Layer::create(const LayerConfig& config) {
+  std::string type = config.type();
+
+#ifndef PADDLE_MOBILE_INFERENCE
+  // NOTE: The following types contain the illegal character '-', so they
+  // cannot be registered through REGISTER_LAYER. Besides, to stay
+  // compatible with old training models, they cannot simply use '_'
+  // instead.
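+  // Illustrative note: REGISTER_LAYER (see Layer.h) token-pastes the type
+  // name into a C++ identifier, so e.g. REGISTER_LAYER(rank-cost, ...)
+  // would expand to "static InitFunction __reg_type_rank-cost(...)",
+  // which is not a valid identifier; hence the explicit dispatch below.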
+ if (type == "multi-class-cross-entropy") + return LayerPtr(new MultiClassCrossEntropy(config)); + else if (type == "rank-cost") + return LayerPtr(new RankingCost(config)); + else if (type == "auc-validation") + return LayerPtr(new AucValidation(config)); + else if (type == "pnpair-validation") + return LayerPtr(new PnpairValidation(config)); +#endif + + return LayerPtr(registrar_.createByType(config.type(), config)); +} + +void Layer::resetSpecifyOutput(Argument& output, + size_t height, + size_t width, + bool isValueClean, + bool isGradClean) { + SetDevice device(output.deviceId); + + Matrix::resizeOrCreate( + output.value, height, width, /* trans */ false, useGpu(output.deviceId)); + if (isValueClean) { + output.value->zeroMem(); + } + + if (passType_ != PASS_TEST && needGradient()) { + Matrix::resizeOrCreate( + output.grad, height, width, /* trans */ false, useGpu(output.deviceId)); + if (isGradClean) { + output.grad->zeroMem(); + } + } +} + +void Layer::resizeOutput(size_t height, size_t width) { + resetSpecifyOutput(output_, height, width, false, false); + + for (size_t i = 0; i != outputOtherDevice_.size(); i++) { + resetSpecifyOutput(outputOtherDevice_[i], height, width, false, false); + } +} + +void Layer::reserveOutput(size_t height, size_t width) { + resetSpecifyOutput(output_, height, width, false, true); + + for (size_t i = 0; i != outputOtherDevice_.size(); i++) { + resetSpecifyOutput(outputOtherDevice_[i], height, width, false, true); + } +} + +void Layer::resetOutput(size_t height, size_t width) { + resetSpecifyOutput(output_, height, width, true, true); + + for (size_t i = 0; i != outputOtherDevice_.size(); i++) { + resetSpecifyOutput(outputOtherDevice_[i], height, width, true, true); + } +} + +void Layer::addOutputArgument(int deviceId) { + if (deviceId == deviceId_) { + output_.countIncrement(); + return; + } else { + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + if (outputOtherDevice_[i].deviceId == deviceId) { + outputOtherDevice_[i].countIncrement(); + return; + } + } + } + + Argument argu; + argu.deviceId = deviceId; + outputOtherDevice_.push_back(argu); + outputOtherDevice_.back().countIncrement(); +} + +void Layer::copyOutputToOtherDevice() { + for (size_t i = 0; i != outputOtherDevice_.size(); i++) { + SetDevice device(outputOtherDevice_[i].deviceId); + // If outputOtherDevice_[i].value is a CpuMatrix, + // the copyFrom is a synchronous interface. + // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent + // calculations are all on HPPL_STREAM_DEFAULT, + // copyFrom can be an asynchronous interface. 
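+    // The notifyValueReady() call below pairs with waitValueReady() in the
+    // consuming layer's waitInputValue(), which is how consumers on other
+    // devices learn that the copy has been issued on the default stream.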
+    outputOtherDevice_[i].value->copyFrom(*getOutputValue(),
+                                          HPPL_STREAM_DEFAULT);
+    outputOtherDevice_[i].sequenceStartPositions =
+        output_.sequenceStartPositions;
+    outputOtherDevice_[i].subSequenceStartPositions =
+        output_.subSequenceStartPositions;
+    outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+
+    outputOtherDevice_[i].notifyValueReady();
+  }
+}
+
+void Layer::waitInputValue() {
+  for (size_t i = 0; i != inputLayers_.size(); i++) {
+    if (inputLayers_[i]->getDeviceId() != deviceId_) {
+      getInput(i).waitValueReady();
+    }
+  }
+}
+
+void Layer::waitAndMergeOutputGrad() {
+  if (!output_.grad || !outputOtherDevice_.size()) {
+    return;
+  }
+
+  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
+    outputOtherDevice_[i].waitGradReady();
+  }
+
+  /* merge output grad */
+  size_t i = 0;
+  if (!output_.getAllCount()) {
+    output_.grad->copyFrom(*outputOtherDevice_[0].grad, HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_1);
+
+    i++;
+    if (outputOtherDevice_.size() == 1) return;
+  }
+
+  Matrix::resizeOrCreate(tmpGrad_,
+                         output_.grad->getHeight(),
+                         output_.grad->getWidth(),
+                         /* trans */ false,
+                         useGpu(output_.deviceId));
+
+  for (; i != outputOtherDevice_.size(); i++) {
+    tmpGrad_->copyFrom(*outputOtherDevice_[i].grad, HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_1);
+    output_.grad->add(*tmpGrad_);
+  }
+}
+
+void Layer::markAllInputGrad() {
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    if (!markInBackward_[i]) {
+      inputLayers_[i]->getOutput(deviceId_).notifyGradReady();
+    }
+    markInBackward_[i] = false;
+  }
+}
+
+void Layer::markInputGrad(int inputIndex) {
+  inputLayers_[inputIndex]->getOutput(deviceId_).notifyGradReady();
+  markInBackward_[inputIndex] = true;
+}
+
+void Layer::zeroGrad() {
+  CHECK(output_.grad.get() != NULL);
+  output_.grad->zeroMem();
+}
+
+void Layer::initNeedFlags() {
+  auto initFlag = [this](
+      bool& flag, bool (Layer::*flagQueryFunc)() const, ParameterType type) {
+    flag = false;
+    if (biasParameter_ && biasParameter_->hasType(type)) {
+      flag = true;
+    }
+    if (!flag) {
+      for (auto& para : parameters_) {
+        if (para && para->hasType(type)) {
+          flag = true;
+          break;
+        }
+      }
+    }
+    if (!flag) {
+      for (auto& layer : inputLayers_) {
+        if ((layer.get()->*flagQueryFunc)()) {
+          flag = true;
+        }
+      }
+    }
+  };
+  initFlag(needGradient_, &Layer::needGradient, PARAMETER_GRADIENT);
+}
+
+void Layer::showOutputStats() {
+  MatrixPtr out = getOutputValue();
+  if (!out) return;
+  if (!out->getElementCnt()) {
+    LOG(INFO) << "The number of output of " << config_.name()
+              << " is 0, skip showing the statistics";
+    return;
+  }
+  MatrixPtr outSquare;
+  if (dynamic_cast<GpuSparseMatrix*>(out.get())) {
+    GpuSparseMatrix* tmp = dynamic_cast<GpuSparseMatrix*>(out.get());
+    outSquare = std::make_shared<GpuSparseMatrix>(tmp->getHeight(),
+                                                  tmp->getWidth(),
+                                                  tmp->getElementCnt(),
+                                                  tmp->getValueType(),
+                                                  tmp->getFormat());
+  } else {
+    outSquare = out->clone();
+  }
+  outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+
+  real mean = outSquare->getSum() / out->getElementCnt();
+  real min;
+  real max;
+  if (dynamic_cast<CpuSparseMatrix*>(outSquare.get())) {
+    auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
+    min = tmpMat->getMin();
+    max = tmpMat->getMax();
+    tmpMat->square2();
+    LOG(INFO) << "show statistics of [non-zero values] in sparse matrix";
+  } else {
+    min = outSquare->getMin();
+    max = outSquare->getMax();
+    outSquare->square2();
+  }
+  real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
+  std = std > 0 ?
std : 0; + LOG(INFO) << "The output state of " << config_.name() << ": mean=" << mean + << ", " + << "std=" << std << ", " + << "min=" << min << ", " + << "max=" << max; +} + +void Layer::forwardActivation() { + /* activation */ + auto status = activation_->forward(output_); + status.check(); + + /* dropout */ + if (config_.drop_rate() > 0) { + forwardDropOut(); + CHECK_NE(activation_->getName(), "softmax") + << "Softmax activation cannot be used with Dropout"; + } + + if (FLAGS_show_layer_stat) { + showOutputStats(); + } +} + +void Layer::backwardActivation() { + /* Do error clipping */ + if (config_.error_clipping_threshold() > 0.0f) { + if (FLAGS_log_error_clipping) { + VectorPtr outGradVec = Vector::create( + output_.grad->getData(), output_.grad->getElementCnt(), useGpu_); + real maxAbsGrad = outGradVec->getAbsMax(); + if (maxAbsGrad > config_.error_clipping_threshold()) { + real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize(); + LOG(INFO) << " layer=" << config_.name() << " need clipping," + << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad; + } + } + output_.grad->clip(-config_.error_clipping_threshold(), + config_.error_clipping_threshold()); + } + + /* Do dropout for delta*/ + if (config_.drop_rate() > 0 && passType_ != PASS_TEST) { + MatrixPtr oGrad = getOutputGrad(); + oGrad->dotMul(*oGrad, *dropOutMask_); + } + + auto status = activation_->backward(output_); + status.check(); +} + +void Layer::forwardDropOut() { + auto& outV = getOutputValue(); + + if (passType_ == PASS_TRAIN) { + // new dropOutMask_ if dropOutMask_ is null ptr + Matrix::resizeOrCreate(dropOutMask_, + outV->getHeight(), + outV->getWidth(), + false, + useGpu(deviceId_)); + dropOutMask_->randomizeUniform(); // generate a uniform random matrix + dropOutMask_->biggerThanScalar(config_.drop_rate()); // random mask + outV->dotMul(*outV, *dropOutMask_); // dropout + } else if (passType_ == PASS_GC) { + // only initialize once + if (!dropOutMask_) { + dropOutMask_ = Matrix::create( + outV->getHeight(), outV->getWidth(), false, useGpu(deviceId_)); + // We use cpu matrix to generate mask so that the mask + // will be same for both gpu version and cpu version. + // This will help unittest to make sure they have same result. + MatrixPtr tmpMask = Matrix::create(outV->getHeight(), outV->getWidth()); + tmpMask->randomizeUniform(); // generate a uniform random matrix + tmpMask->biggerThanScalar(config_.drop_rate()); // random mask + dropOutMask_->copyFrom(*tmpMask); + } + outV->dotMul(*outV, *dropOutMask_); + } else { // passType == PASS_TEST + outV->mulScalar(1.0 - config_.drop_rate()); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/Layer.h b/paddle/legacy/gserver/layers/Layer.h new file mode 100644 index 0000000000000000000000000000000000000000..65ec3bd03faaaf35065f8e2c896d4336103fc0a8 --- /dev/null +++ b/paddle/legacy/gserver/layers/Layer.h @@ -0,0 +1,512 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/function/Function.h"
+#include "paddle/legacy/gserver/activations/ActivationFunction.h"
+#include "paddle/legacy/math/CpuSparseMatrix.h"
+#include "paddle/legacy/parameter/Argument.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/parameter/Weight.h"
+#include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Util.h"
+
+/// Macro for registering a layer type.
+/// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);
+#define REGISTER_LAYER(__type_name, __class_name) \
+  static InitFunction __reg_type_##__type_name( \
+      []() { Layer::registrar_.registerClass<__class_name>(#__type_name); })
+
+#define REGISTER_LAYER_CREATE_FUNC(__type_name, createFunction) \
+  static InitFunction __reg_type_##__type_name( \
+      []() { Layer::registrar_.registerClass(#__type_name, createFunction); })
+
+namespace paddle {
+
+class Layer;
+typedef std::shared_ptr<Layer> LayerPtr;
+typedef std::map<std::string, LayerPtr> LayerMap;
+class NeuralNetwork;
+
+/// layer state, used for RNN and LSTM layers
+struct LayerState {
+  std::vector<MatrixPtr> value;
+};
+typedef std::shared_ptr<LayerState> LayerStatePtr;
+
+/// Paddle device ID, MKLDNN is -2, CPU is -1
+enum PADDLE_DEVICE_ID {
+  MKLDNN_DEVICE = -2,
+  CPU_DEVICE = -1,
+};
+
+/**
+ * @brief Base class for layer.
+ * Define necessary variables and functions for every layer.
+ */
+class Layer {
+ protected:
+  /// Layer config
+  LayerConfig config_;
+  /// whether to use GPU
+  bool useGpu_;
+  /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
+  int deviceId_;
+  /// Input layers
+  std::vector<LayerPtr> inputLayers_;
+  /// Argument of input layers
+  std::vector<std::string> inputArgument_;
+
+  /// Parameter for each input layer.
+  /// Parameters_[i] is nullptr if inputLayers_[i] does not need parameter.
+  std::vector<ParameterPtr> parameters_;
+
+  /// nullptr if bias is not needed.
+  ParameterPtr biasParameter_;
+
+  /// Output
+  Argument output_;
+  /// Several outputs stored on different devices, used in 'parallel_nn' case,
+  /// and record them by deviceId_.
+  /// Also used in 'use_mkldnn' case.
+  std::vector<Argument> outputOtherDevice_;
+  /// If there are several outputs, map them by each name.
+  /// MKLDNNLayer use it only to merge output grad
+  std::map<std::string, Argument*> outputMap_;
+  /// Used to merge grad on different devices.
+  MatrixPtr tmpGrad_;
+
+  std::unique_ptr<ActivationFunction> activation_;
+
+  /// Current passType, PASS_TRAIN or PASS_TEST
+  PassType passType_;
+
+  /// Random 0-1 matrix for dropOut
+  MatrixPtr dropOutMask_;
+
+  /// Whether the layer need to compute gradient
+  bool needGradient_;
+  /// Whether the layer need to compute re-sequence information
+  bool needSequenceInfo_;
+
+  /// Mark input grad in(true) or out(false) of backward function.
+  std::vector<bool> markInBackward_;
+
+  /// Layer forward function
+  std::vector<std::shared_ptr<FunctionBase>> forward_;
+  /// Layer backward function
+  std::vector<std::shared_ptr<FunctionBase>> backward_;
+
+ public:
+  /**
+   * Wait until all input value ready.
+   * Called before Layer::forward() function.
+   */
+  virtual void waitInputValue();
+
+  /**
+   * Copy layer's output_ to other device.
+   * If the output layer is on another device, called after Layer::forward().
+   */
+  virtual void copyOutputToOtherDevice();
+
+  /**
+   * Wait until all output grad ready and merge them to output_.grad.
+   * Called before Layer::backward() function.
+   */
+  virtual void waitAndMergeOutputGrad();
+
+  /**
+   * Notify previous layer the output grad ready.
+   * Called after Layer::backward() function.
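+   * Inputs whose grads were already marked via markInputGrad() inside
+   * backward() are skipped here (tracked by markInBackward_), so every
+   * input grad is notified exactly once per backward pass.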
+   */
+  virtual void markAllInputGrad();
+
+ protected:
+  /**
+   * Create layer function. Function is called in forward or backward.
+   * \param function, Layer::forward_ or Layer::backward_
+   * \param name, function name
+   * \param config, initialization configuration for the function
+   */
+  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
+                      const std::string& name,
+                      const FuncConfig& config) {
+    if (useGpu_) {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
+    } else {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
+    }
+    auto& func = function.back();
+    func->init(config);
+  }
+
+  /**
+   * Notify the specified layer that its output grad is ready.
+   * Called in the backward function.
+   * If you mark input grads in the backward function, you must ensure that
+   * every input grad gets marked there.
+   */
+  void markInputGrad(int inputIndex);
+
+  /**
+   * Get the argument of input layer.
+   */
+  const Argument& getInput(size_t inputIndex) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId_);
+  }
+
+  /**
+   * Get the argument of input layer.
+   */
+  const Argument& getInput(const Layer& inputLayer) const {
+    return inputLayer.getOutput(deviceId_);
+  }
+
+  /**
+   * Get the argument of input layer with deviceId.
+   */
+  const Argument& getInput(size_t inputIndex, int deviceId) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId);
+  }
+
+  /**
+   * Get the forward-input value.
+   */
+  const MatrixPtr& getInputValue(int inputIndex) {
+    return inputLayers_[inputIndex]->getOutput(deviceId_).value;
+  }
+
+  /**
+   * Get the forward-input value.
+   */
+  const MatrixPtr& getInputValue(const Layer& inputLayer) {
+    return inputLayer.getOutput(deviceId_).value;
+  }
+
+  /**
+   * Get the forward-input value with deviceId.
+   */
+  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).value;
+  }
+
+  /**
+   * Get the forward-input grad.
+   */
+  const MatrixPtr& getInputGrad(int inputIndex) {
+    return inputLayers_[inputIndex]->getOutput(deviceId_).grad;
+  }
+
+  /**
+   * Get the forward-input grad.
+   */
+  const MatrixPtr& getInputGrad(const Layer& inputLayer) {
+    return inputLayer.getOutput(deviceId_).grad;
+  }
+
+  /**
+   * Get the forward-input grad with deviceId.
+   */
+  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
+  }
+
+  /**
+   * Get the forward-input label.
+   */
+  const IVectorPtr& getInputLabel(const Layer& inputLayer) {
+    return inputLayer.getOutput(deviceId_).ids;
+  }
+
+  /**
+   * Change the size of output (value, grad).
+   * Reset the value to zero if isValueClean = true,
+   * and the grad to zero if isGradClean = true.
+   */
+  void resetSpecifyOutput(Argument& output,
+                          size_t height,
+                          size_t width,
+                          bool isValueClean,
+                          bool isGradClean);
+
+  /**
+   * Add output argument to other devices.
+   */
+  void addOutputArgument(int deviceId);
+
+ public:
+  explicit Layer(const LayerConfig& config, bool useGpu = FLAGS_use_gpu);
+  virtual ~Layer() {}
+
+  /// Register a Layer
+  static ClassRegistrar<Layer, LayerConfig> registrar_;
+
+  /**
+   * Get the flag whether layer need to compute gradient.
+   */
+  bool needGradient() const { return needGradient_; }
+
+  /**
+   * Set the flag whether layer need to compute gradient.
+   */
+  void setNeedGradient(bool need) { needGradient_ = need; }
+
+  /**
+   * Set the flag whether the layer needs to re-compute sequence information,
+   * which includes sequenceStartPositions and subSequenceStartPositions.
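+   * Layers that construct their own sequence information typically pass
+   * false here so that Layer::forward() does not copy the input's
+   * positions verbatim.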
+   */
+  void setNeedSequenceInfo(bool need) { needSequenceInfo_ = need; }
+
+  /**
+   * Get layer's name.
+   */
+  const std::string& getName() const { return config_.name(); }
+
+  /**
+   * Get layer's type.
+   */
+  const std::string& getType() const { return config_.type(); }
+
+  /**
+   * Get layer's size.
+   */
+  size_t getSize() const { return config_.size(); }
+
+  /**
+   * Get layer's deviceId.
+   */
+  int getDeviceId() const { return deviceId_; }
+
+  /**
+   * Add the inputLayer.
+   */
+  void addPrev(LayerPtr l) { inputLayers_.push_back(l); }
+
+  /**
+   * Get inputLayer[i].
+   */
+  const LayerPtr& getPrev(size_t i) { return inputLayers_[i]; }
+
+  /**
+   * Get the forward-output value.
+   */
+  const MatrixPtr& getOutputValue() { return output_.value; }
+
+  /**
+   * Get the forward-output label.
+   */
+  const IVectorPtr& getOutputLabel() { return output_.ids; }
+
+  /**
+   * Get the output gradient (backward loss) value.
+   */
+  const MatrixPtr& getOutputGrad() { return output_.grad; }
+  /**
+   * If layer has multi-output, set output into outputMap_.
+   */
+  void setOutput(const std::string& name, Argument* output) {
+    outputMap_[name] = output;
+  }
+
+  /**
+   * Get the output map size, if layer has multi-output.
+   */
+  size_t getOutputMapSize() { return outputMap_.size(); }
+
+  /**
+   * Get the output based on layer's name.
+   */
+  Argument& getOutput(const std::string& str = "") {
+    if (str == "") {
+      return output_;
+    } else {
+      auto output = outputMap_.find(str);
+      if (output != outputMap_.end()) {
+        return *output->second;
+      } else {
+        LOG(FATAL) << "No specific output " << str;
+        return *((Argument*)nullptr);
+      }
+    }
+  }
+
+  /**
+   * Get the output based on deviceId.
+   */
+  const Argument& getOutput(int deviceId) const {
+    if (deviceId == getDeviceId()) {
+      return output_;
+    } else {
+      for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+        if (outputOtherDevice_[i].deviceId == deviceId) {
+          return outputOtherDevice_[i];
+        }
+      }
+
+      LOG(FATAL) << "No specific device output ";
+      return *((Argument*)nullptr);
+    }
+  }
+
+  /**
+   * Get layer's parameters.
+   */
+  const std::vector<ParameterPtr>& getParameters() { return parameters_; }
+
+  /**
+   * Get layer's bias-parameters.
+   */
+  const ParameterPtr& getBiasParameter() { return biasParameter_; }
+
+  /**
+   * Create pointer of layer.
+   */
+  static LayerPtr create(const LayerConfig& config);
+
+  /**
+   * Resize the output matrix size.
+   */
+  void resizeOutput(size_t height, size_t width);
+
+  /**
+   * Resize the output matrix size,
+   * and reset the grad to zero.
+   */
+  void reserveOutput(size_t height, size_t width);
+
+  /**
+   * Resize the output matrix size,
+   * and reset value and grad to zero.
+   */
+  void resetOutput(size_t height, size_t width);
+
+  /**
+   * Clear the gradient of output.
+   */
+  void zeroGrad();
+
+  /**
+   * Initialization.
+   * For example, adding input layers from layerMap and parameterMap.
+   */
+  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  /**
+   * Initialization for the sub-network, if one exists.
+   * @param rootNetwork root network
+   * @param config model config
+   * @param parameterTypes parameter types
+   * @param useGpu whether to use gpu or not
+   */
+  virtual void initSubNetwork(NeuralNetwork* rootNetwork,
+                              const ModelConfig& config,
+                              const std::vector<ParameterType>& parameterTypes,
+                              bool useGpu) {}
+
+  /**
+   * @brief Access the sub-network object.
+   * If the sub-network exists, invoke the callback with it.
+   * @param callback invoked only if the sub-network exists.
+   */
+  virtual void accessSubNetwork(
+      const std::function<void(NeuralNetwork&)>& callback) {}
+
+  /**
+   * If use sparse row matrix as parameter,
+   * prefetch feature ids in input label.
+   */
+  virtual void prefetch() {}
+
+  /**
+   * Forward propagation.
+   * All derived implementations should call Layer::forward().
+   */
+  virtual void forward(PassType passType) {
+    passType_ = passType;
+    if (!inputLayers_.empty() && needSequenceInfo_) {
+      const Argument& input = getInput(0);
+      output_.sequenceStartPositions = input.sequenceStartPositions;
+      output_.subSequenceStartPositions = input.subSequenceStartPositions;
+      output_.cpuSequenceDims = input.cpuSequenceDims;
+    }
+  }
+
+  /**
+   * Reset the internal state variables.
+   * Allocate them if they have not been allocated.
+   * This function needs to be called before Layer::forward() for generating
+   * sequence.
+   *
+   * This is used for sequence generation. When generating sequence, the
+   * calculation at current timestamp depends on the state from previous
+   * timestamp. The model needs to keep the information about the previous
+   * timestamp in the state variables. Layers such as RecurrentLayer,
+   * LstmLayer and ContextLayer have state variables.
+   */
+  virtual void resetState() {}
+
+  /**
+   * Set layer state.
+   */
+  virtual void setState(LayerStatePtr state) {}
+
+  /**
+   * Get layer state.
+   * @return A copy of internal state.
+   */
+  virtual LayerStatePtr getState() { return nullptr; }
+
+  /**
+   * Show output state.
+   */
+  void showOutputStats();
+
+  /**
+   * Backward propagation.
+   * Should only be called after Layer::forward() function.
+   */
+  virtual void backward(const UpdateCallback& callback = nullptr) = 0;
+
+  /**
+   * One pass is finished.
+   */
+  virtual void onPassEnd() {}
+
+ protected:
+  /**
+   * Forward of activation function.
+   */
+  void forwardActivation();
+  /**
+   * Backward of activation function.
+   */
+  void backwardActivation();
+  /**
+   * Forward of dropOut.
+   */
+  void forwardDropOut();
+  /**
+   * Initialize the needGradient_ flag.
+   */
+  void initNeedFlags();
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/legacy/gserver/layers/LinearChainCRF.cpp
similarity index 100%
rename from paddle/gserver/layers/LinearChainCRF.cpp
rename to paddle/legacy/gserver/layers/LinearChainCRF.cpp
diff --git a/paddle/legacy/gserver/layers/LinearChainCRF.h b/paddle/legacy/gserver/layers/LinearChainCRF.h
new file mode 100644
index 0000000000000000000000000000000000000000..65e23905435da24a1a7554c30e33d303b05aef69
--- /dev/null
+++ b/paddle/legacy/gserver/layers/LinearChainCRF.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+class LinearChainCRF {
+ public:
+  /**
+   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
+   * The first numClasses values of para are for starting weights (\f$a\f$).
+   * The next numClasses values of para are for ending weights (\f$b\f$).
+   * The remaining values are for transition weights (\f$w\f$).
+   *
+   * The probability of a state sequence s of length \f$L\f$ is defined as:
+   * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
+   *                    + \sum_{l=1}^L x_{s_l}
+   *                    + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
+   * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
+   * all possible
+   * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
+   */
+  LinearChainCRF(int numClasses, real* para);
+
+  /**
+   * Calculate the negative log likelihood of s given x.
+   * The size of x must be length * numClasses. Each consecutive numClasses
+   * values are the features for one time step.
+   */
+  real forward(real* x, int* s, int length);
+
+  /**
+   * Calculate the gradient with respect to x, a, b, and w.
+   * backward() can only be called after a corresponding call to forward()
+   * with the same x, s and length.
+   * The gradient with respect to a, b, and w will not be calculated if
+   * needWGrad is false.
+   * @note Please call getWGrad() and getXGrad() to get the gradient with
+   * respect to (a, b, w) and x respectively.
+   */
+  void backward(real* x, int* s, int length, bool needWGrad);
+
+  /**
+   * Find the most probable sequence given x. The result will be stored in s.
+   */
+  void decode(real* x, int* s, int length);
+
+  /*
+   * Return the gradient with respect to (a, b, w). It can only be called
+   * after a corresponding call to backward().
+   */
+  MatrixPtr getWGrad() { return matWGrad_; }
+
+  /*
+   * Return the gradient with respect to x. It can only be called after a
+   * corresponding call to backward().
+   */
+  MatrixPtr getXGrad() { return matGrad_; }
+
+ protected:
+  int numClasses_;
+  MatrixPtr a_;
+  MatrixPtr b_;
+  MatrixPtr w_;
+  MatrixPtr matWGrad_;
+  MatrixPtr da_;
+  MatrixPtr db_;
+  MatrixPtr dw_;
+  MatrixPtr ones_;
+
+  MatrixPtr expX_;
+  MatrixPtr matGrad_;
+  MatrixPtr alpha_;
+  MatrixPtr beta_;
+  MatrixPtr maxX_;
+  MatrixPtr expW_;
+
+  // track_(k,i) = j means that the best sequence at time k for class i comes
+  // from the sequence at time k-1 for class j
+  IVectorPtr track_;
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/LinearChainCTC.cpp b/paddle/legacy/gserver/layers/LinearChainCTC.cpp
similarity index 100%
rename from paddle/gserver/layers/LinearChainCTC.cpp
rename to paddle/legacy/gserver/layers/LinearChainCTC.cpp
diff --git a/paddle/legacy/gserver/layers/LinearChainCTC.h b/paddle/legacy/gserver/layers/LinearChainCTC.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6c4c7bfe0cdb1bbcafbf5b847ea592eef02794a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/LinearChainCTC.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+
+#include <vector>
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+class LinearChainCTC {
+ public:
+  LinearChainCTC(int numClasses, bool normByTimes);
+
+  // Calculate the negative log probability as loss
+  real forward(real* softmaxSeq,
+               int softmaxSeqLen,
+               int* labelSeq,
+               int labelSeqLen);
+
+  // Calculate the gradient
+  void backward(real* softmaxSeq,
+                real* softmaxSeqGrad,
+                int* labelSeq,
+                int labelSeqLen);
+
+ protected:
+  int numClasses_, blank_, totalSegments_, totalTime_;
+  bool normByTimes_;
+  bool isInvalid_;
+
+  MatrixPtr logActs_, forwardVars_, backwardVars_, gradTerms_;
+
+  real logProb_;
+
+  void segmentRange(int& start, int& end, int time);
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/LstmCompute.cpp b/paddle/legacy/gserver/layers/LstmCompute.cpp
similarity index 100%
rename from paddle/gserver/layers/LstmCompute.cpp
rename to paddle/legacy/gserver/layers/LstmCompute.cpp
diff --git a/paddle/gserver/layers/LstmCompute.cu b/paddle/legacy/gserver/layers/LstmCompute.cu
similarity index 100%
rename from paddle/gserver/layers/LstmCompute.cu
rename to paddle/legacy/gserver/layers/LstmCompute.cu
diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/legacy/gserver/layers/LstmCompute.h
similarity index 100%
rename from paddle/gserver/layers/LstmCompute.h
rename to paddle/legacy/gserver/layers/LstmCompute.h
diff --git a/paddle/legacy/gserver/layers/LstmLayer.cpp b/paddle/legacy/gserver/layers/LstmLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb40ec05855c8e2871a5ea8181eeae480db54b1a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/LstmLayer.cpp
@@ -0,0 +1,805 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "LstmLayer.h" +#include "paddle/legacy/math/BaseMatrix.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/Stat.h" + +DECLARE_bool(prev_batch_state); + +namespace paddle { + +REGISTER_LAYER(lstmemory, LstmLayer); + +bool LstmLayer::init(const LayerMap &layerMap, + const ParameterMap ¶meterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(1U, inputLayers_.size()); + CHECK_EQ(1U, parameters_.size()); + CHECK_EQ(getSize() * getSize() * 4, parameters_[0]->getSize()); + CHECK_EQ(getSize() * 7, biasParameter_->getSize()); + weight_.reset(new Weight(getSize(), getSize() * 4, parameters_[0])); + if (biasParameter_.get() != NULL) { + bias_.reset(new Weight(1, getSize() * 7, biasParameter_)); + if (bias_->getW()) { + localBias_ = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + checkIg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + + localBias_->setData(bias_->getW()->getData()); + checkIg_->setData(bias_->getW()->getData() + getSize() * 4); + checkFg_->setData(bias_->getW()->getData() + getSize() * 5); + checkOg_->setData(bias_->getW()->getData() + getSize() * 6); + } + + if (bias_->getWGrad()) { + localBiasGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + checkIgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + localBiasGrad_->setData(bias_->getWGrad()->getData()); + checkIgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 4); + checkFgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 5); + checkOgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 6); + } + } else { + LOG(FATAL) << "Bias should be here."; + } + reversed_ = config_.reversed(); + + // create IdentityActivation for using drop_rate + activation_.reset(ActivationFunction::create("")); + + LstmCompute::init(config_); + useBatch_ = true; + useSeqParallel_ = false; + if (useGpu_ && (getSize() == 32 || getSize() == 64)) { + useSeqParallel_ = true; + } + + return true; +} + +void LstmLayer::resetState() { + CHECK(!reversed_) << "state is not allowed for reversed lstmemory layer"; + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(prevState_, 1, getSize(), /* trans= */ false, useGpu_); + prevOutput_->resize(0, getSize()); + prevState_->resize(0, getSize()); + if (FLAGS_prev_batch_state) { + useBatch_ = true; + } else { + useBatch_ = false; + } +} + +void LstmLayer::setState(LayerStatePtr state) { + CHECK(state->value.size() == 2) << "two matrices are expected for LSTM state"; + prevOutput_->resize(state->value[0]->getHeight(), + state->value[0]->getWidth()); + prevState_->resize(state->value[1]->getHeight(), state->value[1]->getWidth()); + prevOutput_->copyFrom(*(state->value[0])); + prevState_->copyFrom(*(state->value[1])); +} + +LayerStatePtr LstmLayer::getState() { + LayerStatePtr res = std::make_shared(); + if (prevOutput_->getHeight() && prevOutput_->getWidth()) { + 
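+    // Return deep copies (clone + copyFrom) so the caller's snapshot is not
+    // mutated by subsequent forward() calls.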
res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); + res->value[0]->copyFrom(*prevOutput_); + res->value.push_back(prevState_->clone(0, 0, useGpu_)); + res->value[1]->copyFrom(*prevState_); + } else { + MatrixPtr output = + Matrix::create(1, getSize(), /* trans= */ false, useGpu_); + MatrixPtr state = Matrix::create(1, getSize(), /* trans= */ false, useGpu_); + output->resize(0, getSize()); + state->resize(0, getSize()); + res->value.push_back(output); + res->value.push_back(state); + } + return res; +} + +void LstmLayer::forward(PassType passType) { + REGISTER_TIMER_INFO("LstmFwTimer", getName().c_str()); + Layer::forward(passType); + + const Argument &input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + resetOutput(batchSize, getSize()); + CHECK_EQ(getSize() * 4, input.value->getWidth()); + size_t numSequences = input.getNumSequences(); + const int *starts = input.sequenceStartPositions->getData(false); + CHECK_EQ(starts[numSequences], batchSize); + + Matrix::resizeOrCreate(gate_.value, + /* height= */ batchSize, + getSize() * 4, + /* trans= */ false, + useGpu_); + if (prevOutput_) { + size_t prevNumSeq = useBatch_ ? numSequences : 1; + if (prevOutput_->getHeight() == 0) { + prevOutput_->resize(prevNumSeq, getSize()); + prevState_->resize(prevNumSeq, getSize()); + prevOutput_->zeroMem(); + prevState_->zeroMem(); + } else { + CHECK_EQ(prevOutput_->getHeight(), prevNumSeq) + << "the number of sequences must be the same"; + } + Matrix::resizeOrCreate(totalState_, + prevState_->getHeight() + batchSize, + getSize(), + /*trans*/ false, + useGpu_); + state_.value = Matrix::create(nullptr, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + state_.value->setData(totalState_->getData() + + prevState_->getHeight() * getSize()); + } else { + Matrix::resizeOrCreate(state_.value, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + } + Matrix::resizeOrCreate(preOutput_.value, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + + if (!useBatch_) { + forwardSequence(batchSize, numSequences, starts, input.value); + } else { + if (!useSeqParallel_) { + forwardBatch(batchSize, numSequences, starts, input.value); + } else { + const int *starts = input.sequenceStartPositions->getData(useGpu_); + forwardSeqParallel(batchSize, numSequences, starts, input.value); + } + } + /* activation */ { forwardActivation(); } +} + +void LstmLayer::backward(const UpdateCallback &callback) { + REGISTER_TIMER_INFO("LstmBwTimer", getName().c_str()); + /* Do derivation */ { backwardActivation(); } + + const Argument &input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + size_t numSequences = input.getNumSequences(); + + Matrix::resizeOrCreate(gate_.grad, + /* height= */ batchSize, + getSize() * 4, + /* trans= */ false, + useGpu_); + Matrix::resizeOrCreate(state_.grad, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + Matrix::resizeOrCreate(preOutput_.grad, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + state_.grad->zero(); + + const int *starts = input.sequenceStartPositions->getData(false); + if (!useBatch_) { + backwardSequence(batchSize, numSequences, starts, input.grad); + } else { + if (!useSeqParallel_) { + backwardBatch(batchSize, numSequences, starts, input.grad); + } else { + const int *starts = input.sequenceStartPositions->getData(useGpu_); + backwardSeqParallel(batchSize, numSequences, 
starts, input.grad); + } + } + + if (bias_) { + bias_->getParameterPtr()->incUpdate(callback); + } + weight_->getParameterPtr()->incUpdate(callback); +} + +void LstmLayer::forwardSequence(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { + REGISTER_TIMER_INFO("LstmFwSequenceTime", getName().c_str()); + gate_.value->assign(*inputValue); + if (bias_) { + gate_.value->addBias(*localBias_, 1); + } + + hl_lstm_value lstmValue; + lstmValue.checkIg = checkIg_->getData(); + lstmValue.checkFg = checkFg_->getData(); + lstmValue.checkOg = checkOg_->getData(); + lstmValue.gateValue = gate_.value->getData(); + lstmValue.stateValue = state_.value->getData(); + lstmValue.stateActiveValue = preOutput_.value->getData(); + lstmValue.outputValue = output_.value->getData(); + lstmValue.prevStateValue = nullptr; + if (reversed_) { + lstmValue.gateValue += (batchSize - 1) * getSize() * 4; + lstmValue.stateValue += (batchSize - 1) * getSize(); + lstmValue.stateActiveValue += (batchSize - 1) * getSize(); + lstmValue.outputValue += (batchSize - 1) * getSize(); + } + + auto nextFrame = [&lstmValue](bool reversed, int frameSize) { + lstmValue.prevStateValue = lstmValue.stateValue; + if (!reversed) { + lstmValue.gateValue += frameSize * 4; + lstmValue.stateValue += frameSize; + lstmValue.stateActiveValue += frameSize; + lstmValue.outputValue += frameSize; + } else { + lstmValue.gateValue -= frameSize * 4; + lstmValue.stateValue -= frameSize; + lstmValue.stateActiveValue -= frameSize; + lstmValue.outputValue -= frameSize; + } + }; + + MatrixPtr frameGate = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + MatrixPtr frameOutput = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + + if (!reversed_) { + if (prevState_) { + lstmValue.prevStateValue = prevState_->getData(); + } + if (prevOutput_) { + frameGate->setData(lstmValue.gateValue); + frameGate->mul(*prevOutput_, *weight_->getW(), 1, 1); + } + } + AsyncGpuBlock asyncGpuBlock; + for (size_t n = 0; n < numSequences; ++n) { + int length; + if (!reversed_) { + length = starts[n + 1] - starts[n]; + } else { + length = starts[numSequences - n] - starts[numSequences - n - 1]; + } + for (int l = 0; l < length; ++l) { + if (useGpu_) { + LstmCompute::forwardOneSequence<1>(lstmValue, getSize()); + } else { + LstmCompute::forwardOneSequence<0>(lstmValue, getSize()); + } + + if (l != length - 1) { + frameOutput->setData(lstmValue.outputValue); + nextFrame(reversed_, getSize()); + frameGate->setData(lstmValue.gateValue); + frameGate->mul(*frameOutput, *weight_->getW(), 1, 1); + } + } + if (n != numSequences - 1) { + frameOutput->setData(lstmValue.outputValue); + nextFrame(reversed_, getSize()); + frameGate->setData(lstmValue.gateValue); + if (!reversed_) { + if (!prevState_) lstmValue.prevStateValue = nullptr; + if (prevOutput_) { + frameGate->mul(*frameOutput, *weight_->getW(), 1, 1); + } + } else { + lstmValue.prevStateValue = nullptr; + } + } + } + + if (!reversed_) { + if (prevState_) { + prevState_->assign(*state_.value->subMatrix(batchSize - 1, 1)); + } + if (prevOutput_) { + prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1)); + } + } +} + +void LstmLayer::backwardSequence(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { + REGISTER_TIMER_INFO("LstmBwSequenceTime", getName().c_str()); + MatrixPtr weightT = weight_->getW()->getTranspose(); + + hl_lstm_value lstmValue; + hl_lstm_grad lstmGrad; + 
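+  // lstmValue / lstmGrad hold raw pointers into the gate, state and output
+  // buffers; the nextFrame lambda below advances them by one frame
+  // (getSize() columns, or 4 * getSize() for the gates) per time step.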
lstmValue.checkIg = checkIg_->getData(); + lstmValue.checkFg = checkFg_->getData(); + lstmValue.checkOg = checkOg_->getData(); + lstmValue.gateValue = gate_.value->getData(); + lstmValue.stateValue = state_.value->getData(); + lstmValue.stateActiveValue = preOutput_.value->getData(); + lstmValue.outputValue = nullptr; + + if (bias_->getWGrad()) { + lstmGrad.checkIgGrad = checkIgGrad_->getData(); + lstmGrad.checkFgGrad = checkFgGrad_->getData(); + lstmGrad.checkOgGrad = checkOgGrad_->getData(); + } else { + lstmGrad.checkIgGrad = nullptr; + lstmGrad.checkFgGrad = nullptr; + lstmGrad.checkOgGrad = nullptr; + } + lstmGrad.gateGrad = gate_.grad->getData(); + lstmGrad.stateGrad = state_.grad->getData(); + lstmGrad.stateActiveGrad = nullptr; + lstmGrad.outputGrad = output_.grad->getData(); + + if (!reversed_) { + lstmValue.gateValue += (batchSize - 1) * getSize() * 4; + lstmGrad.gateGrad += (batchSize - 1) * getSize() * 4; + lstmValue.stateValue += (batchSize - 1) * getSize(); + lstmGrad.stateGrad += (batchSize - 1) * getSize(); + lstmValue.stateActiveValue += (batchSize - 1) * getSize(); + lstmGrad.outputGrad += (batchSize - 1) * getSize(); + lstmValue.prevStateValue = lstmValue.stateValue - getSize(); + lstmGrad.prevStateGrad = lstmGrad.stateGrad - getSize(); + } else { + lstmValue.prevStateValue = lstmValue.stateValue + getSize(); + lstmGrad.prevStateGrad = lstmGrad.stateGrad + getSize(); + } + + auto nextFrame = [&lstmValue, &lstmGrad](bool reversed, int frameSize) { + if (reversed) { + lstmValue.gateValue += frameSize * 4; + lstmGrad.gateGrad += frameSize * 4; + lstmValue.stateValue += frameSize; + lstmGrad.stateGrad += frameSize; + lstmValue.stateActiveValue += frameSize; + lstmGrad.outputGrad += frameSize; + lstmValue.prevStateValue = lstmValue.stateValue + frameSize; + lstmGrad.prevStateGrad = lstmGrad.stateGrad + frameSize; + } else { + lstmValue.gateValue -= frameSize * 4; + lstmGrad.gateGrad -= frameSize * 4; + lstmValue.stateValue -= frameSize; + lstmGrad.stateGrad -= frameSize; + lstmValue.stateActiveValue -= frameSize; + lstmGrad.outputGrad -= frameSize; + lstmValue.prevStateValue = lstmValue.stateValue - frameSize; + lstmGrad.prevStateGrad = lstmGrad.stateGrad - frameSize; + } + }; + + MatrixPtr frameGate = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + MatrixPtr frameOutput = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + + { + AsyncGpuBlock asyncGpuBlock; + for (size_t n = 0; n < numSequences; ++n) { + int length; + int start; + if (reversed_) { + length = starts[n + 1] - starts[n]; + start = starts[n]; + } else { + length = starts[numSequences - n] - starts[numSequences - n - 1]; + start = starts[numSequences - n - 1]; + } + for (int l = 0; l < length; ++l) { + if (l == length - 1) { + lstmValue.prevStateValue = nullptr; + lstmGrad.prevStateGrad = nullptr; + } + if (useGpu_) { + LstmCompute::backwardOneSequence<1>(lstmValue, lstmGrad, getSize()); + } else { + LstmCompute::backwardOneSequence<0>(lstmValue, lstmGrad, getSize()); + } + if (l != length - 1) { + frameGate->setData(lstmGrad.gateGrad); + nextFrame(reversed_, getSize()); + frameOutput->setData(lstmGrad.outputGrad); + frameOutput->mul(*frameGate, *weightT, 1, 1); + } else { + nextFrame(reversed_, getSize()); + } + } + + if (weight_->getWGrad()) { + if (!reversed_) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(start, length - 1)->getTranspose(), + *gate_.grad->subMatrix(start + 1, length - 1), + 1, + 1); + } else { + 
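+          // Reversed sequence: the output at frame t + 1 produced the gates
+          // at frame t, so the operand windows here are shifted by one frame
+          // relative to the forward-direction branch above.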
weight_->getWGrad()->mul( + *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), + *gate_.grad->subMatrix(start, length - 1), + 1, + 1); + } + } + } + } + + if (inputGrad) { + inputGrad->add(*gate_.grad); + } + if (bias_ && bias_->getWGrad()) { + localBiasGrad_->collectBias(*gate_.grad, 1); + } +} + +void LstmLayer::forwardBatch(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { + REGISTER_TIMER_INFO("LstmFwBatchTime", getName().c_str()); + + hl_lstm_value lstmValue; + lstmValue.checkIg = checkIg_->getData(); + lstmValue.checkFg = checkFg_->getData(); + lstmValue.checkOg = checkOg_->getData(); + + if (!batchValue_) { + batchValue_.reset(new SequenceToBatch(useGpu_)); + } + batchValue_->resizeOrCreateBatch( + batchSize, numSequences, starts, reversed_, prevOutput_ ? true : false); + + batchValue_->resizeOrCreate(*output_.value); + batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true); + if (bias_) { + gate_.value->addBias(*localBias_, 1); + } + + { + int numBatch = batchValue_->getNumBatch(); + int batchSize = 0; + AsyncGpuBlock asyncGpuBlock; + if (prevState_) { + lstmValue.prevStateValue = totalState_->getData(); + } else { + lstmValue.prevStateValue = nullptr; + } + for (int n = 0; n < numBatch; n++) { + MatrixPtr outputValue = batchValue_->getBatchValue(n); + MatrixPtr gateValue = batchValue_->getBatchValue(*gate_.value, n); + batchSize = outputValue->getHeight(); + + if (n != 0) { + MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize); + gateValue->mul(*batch1, *weight_->getW(), 1, 1); + } else if (prevOutput_) { + Matrix::resizeOrCreate(prevBatchOutput2_, + gateValue->getHeight(), + getSize(), + false, + useGpu_); + batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_); + gateValue->mul(*prevBatchOutput2_, *weight_->getW(), 1, 1); + + batchValue_->prevOutput2Batch(*prevState_, + *totalState_->subMatrix(0, numSequences)); + } + + lstmValue.gateValue = gateValue->getData(); + lstmValue.outputValue = outputValue->getData(); + lstmValue.stateValue = + batchValue_->getBatchValue(*state_.value, n)->getData(); + lstmValue.stateActiveValue = + batchValue_->getBatchValue(*preOutput_.value, n)->getData(); + { + if (useGpu_) { + LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize); + } else { + LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize); + } + } + lstmValue.prevStateValue = lstmValue.stateValue; + } + } + { + REGISTER_TIMER_INFO("batchToSeq", getName().c_str()); + batchValue_->copyBackSeq(*output_.value); + } + if (prevOutput_) { + getPrevBatchOutput(numSequences); + getPrevBatchState(numSequences); + } +} + +void LstmLayer::getPrevBatchOutput(size_t numSequences) { + prevOutput_->resize(numSequences, getSize()); + batchValue_->getSeqOutputFromBatch(*prevOutput_, + *batchValue_->getBatchValue()); +} + +void LstmLayer::getPrevBatchState(size_t numSequences) { + prevState_->resize(numSequences, getSize()); + batchValue_->getSeqOutputFromBatch(*prevState_, *state_.value); +} + +void LstmLayer::backwardBatch(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { + REGISTER_TIMER_INFO("LstmBwBatchTime", getName().c_str()); + + hl_lstm_value lstmValue; + lstmValue.checkIg = checkIg_->getData(); + lstmValue.checkFg = checkFg_->getData(); + lstmValue.checkOg = checkOg_->getData(); + + hl_lstm_grad lstmGrad; + lstmGrad.stateActiveGrad = preOutput_.grad->getData(); + + if (bias_->getWGrad()) { + lstmGrad.checkIgGrad = checkIgGrad_->getData(); + lstmGrad.checkFgGrad = 
checkFgGrad_->getData(); + lstmGrad.checkOgGrad = checkOgGrad_->getData(); + } else { + lstmGrad.checkIgGrad = nullptr; + lstmGrad.checkFgGrad = nullptr; + lstmGrad.checkOgGrad = nullptr; + } + + if (!batchGrad_) { + batchGrad_.reset(new SequenceToBatch(useGpu_)); + } + batchGrad_->shareIndexWith(*batchValue_); + + { + REGISTER_TIMER_INFO("seqToBatch", getName().c_str()); + batchGrad_->copyFromSeq(*output_.grad); + } + + { + MatrixPtr weightT = weight_->getW()->getTranspose(); + int numBatch = batchGrad_->getNumBatch(); + int batchSize = 0; + AsyncGpuBlock asyncGpuBlock; + for (int n = (int)numBatch - 1; n >= 0; n--) { + MatrixPtr outputGrad = batchGrad_->getBatchValue(n); + MatrixPtr gateGrad = batchGrad_->getBatchValue(*gate_.grad, n); + + lstmValue.gateValue = + batchGrad_->getBatchValue(*gate_.value, n)->getData(); + lstmValue.stateValue = + batchGrad_->getBatchValue(*state_.value, n)->getData(); + lstmValue.stateActiveValue = + batchGrad_->getBatchValue(*preOutput_.value, n)->getData(); + lstmGrad.stateGrad = + batchGrad_->getBatchValue(*state_.grad, n)->getData(); + lstmGrad.gateGrad = gateGrad->getData(); + lstmGrad.outputGrad = outputGrad->getData(); + { + batchSize = outputGrad->getHeight(); + if (n != 0) { + lstmValue.prevStateValue = + batchGrad_->getBatchValue(*state_.value, n - 1)->getData(); + lstmGrad.prevStateGrad = + batchGrad_->getBatchValue(*state_.grad, n - 1)->getData(); + } else { + if (prevState_) { + lstmValue.prevStateValue = totalState_->getData(); + lstmGrad.prevStateGrad = nullptr; + } else { + lstmValue.prevStateValue = nullptr; + lstmGrad.prevStateGrad = nullptr; + } + } + if (useGpu_) { + LstmCompute::backwardBatch<1>( + lstmValue, lstmGrad, getSize(), batchSize); + } else { + LstmCompute::backwardBatch<0>( + lstmValue, lstmGrad, getSize(), batchSize); + } + } + + if (n != 0) { + MatrixPtr tmp = batchGrad_->getBatchValue(n - 1, batchSize); + tmp->mul(*gateGrad, *weightT, 1, 1); + } + + if (n != 0 && weight_->getWGrad()) { + /* backward weight */ + MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize); + weight_->getWGrad()->mul(*outputValue->getTranspose(), *gateGrad, 1, 1); + } else if (prevOutput_ && weight_->getWGrad()) { + weight_->getWGrad()->mul( + *prevBatchOutput2_->getTranspose(), *gateGrad, 1, 1); + } + } + } + + if (inputGrad) { + batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false); + } + if (bias_ && bias_->getWGrad()) { + localBiasGrad_->collectBias(*gate_.grad, /* scale */ 1); + } +} + +void LstmLayer::forwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { + REGISTER_TIMER_INFO("LstmFwSeqParallelTime", getName().c_str()); + gate_.value->assign(*inputValue); + if (bias_) { + gate_.value->addBias(*localBias_, /* scale */ 1); + } + + real *gateValue = gate_.value->getData(); + real *stateValue = state_.value->getData(); + real *outputValue = output_.value->getData(); + real *preOutputValue = preOutput_.value->getData(); + real *checkIg = checkIg_->getData(); + real *checkFg = checkFg_->getData(); + real *checkOg = checkOg_->getData(); + real *weight = weight_->getW()->getData(); + hl_lstm_parallel_forward(gateValue, + stateValue, + preOutputValue, + outputValue, + checkIg, + checkFg, + checkOg, + weight, + starts, + getSize(), + numSequences, + reversed_, + activeNode_, + activeGate_, + activeState_); +} + +void LstmLayer::backwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { + REGISTER_TIMER_INFO("LstmBwSeqParallelTime", 
getName().c_str()); + real *gateValue = gate_.value->getData(); + real *gateGrad = gate_.grad->getData(); + real *stateValue = state_.value->getData(); + real *stateGrad = state_.grad->getData(); + real *preOutputValue = preOutput_.value->getData(); + real *preOutputGrad = preOutput_.grad->getData(); + real *checkIg = checkIg_->getData(); + real *checkFg = checkFg_->getData(); + real *checkOg = checkOg_->getData(); + real *outputGrad = output_.grad->getData(); + real *weight = weight_->getW()->getData(); + + real *checkIgGrad; + real *checkFgGrad; + real *checkOgGrad; + if (bias_->getWGrad()) { + checkIgGrad = checkIgGrad_->getData(); + checkFgGrad = checkFgGrad_->getData(); + checkOgGrad = checkOgGrad_->getData(); + } else { + checkIgGrad = nullptr; + checkFgGrad = nullptr; + checkOgGrad = nullptr; + } + + hl_lstm_parallel_backward_data(gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + outputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + weight, + starts, + getSize(), + numSequences, + reversed_, + activeNode_, + activeGate_, + activeState_); + + if (inputGrad) { + inputGrad->add(*gate_.grad); + } + if (bias_ && bias_->getWGrad()) { + localBiasGrad_->collectBias(*gate_.grad, 1); + } + + real *outputValue = output_.value->getData(); + if (weight_->getWGrad()) { + real *weightGrad = weight_->getWGrad()->getData(); + hl_lstm_parallel_backward_weight(weightGrad, + outputValue, + gateGrad, + starts, + getSize(), + batchSize, + numSequences, + reversed_); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/LstmLayer.h b/paddle/legacy/gserver/layers/LstmLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..8c8b382f505d791fb1ef4265dcfe95046aa832fb --- /dev/null +++ b/paddle/legacy/gserver/layers/LstmLayer.h @@ -0,0 +1,221 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "LstmCompute.h" +#include "SequenceToBatch.h" +#include "paddle/legacy/math/BaseMatrix.h" +#include "paddle/legacy/math/Matrix.h" +namespace paddle { + +/** + * @brief LstmLayer takes 1 input layer with size * 4. 
+ * Input layer is divided into 4 equal parts:
+ *   (input_s, input_ig, input_fg, input_og)
+ *
+ * For each sequence [start, end] it performs the following computation:
+ * @code
+ * output_{i} = actState(state_{i}) * actGate(outputGate_{i})
+ * state_{i} = actInput(input_s_{i} + bias_s +
+ *             output_{i-1} * recurrIW) * actGate(inputGate_{i}) +
+ *             actGate(forgetGate_{i}) * state_{i-1}
+ * inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW +
+ *             state_{i-1} * inputCheck
+ * outputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW +
+ *              state_{i} * outputCheck
+ * forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW +
+ *              state_{i-1} * forgetCheck
+ * @endcode
+ *
+ * - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
+ * - biasParameter consists of
+ *   (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
+ *
+ * - actInput is defined by config active_type.
+ * - actState is defined by config active_state_type.
+ * - actGate is defined by config active_gate_type.
+ *
+ * There are two ways to compute: one sequence by one sequence, or
+ * one batch by one batch. By default, unless pre_batch_state is set to
+ * true, it computes batch by batch.
+ *
+ * The formula in the paper is as follows:
+ * \f[
+ * i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\
+ * f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\
+ * \tilde{c_t} = tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) \\
+ * o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\
+ * c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\
+ * h_t = o_t tanh(c_t)
+ * \f]
+ *
+ * @note The \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
+ * operations on the input sequence are NOT included in LstmLayer, so
+ * users should use fc_layer or mixed_layer before lstm_layer.
+ *
+ * The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
+ * The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
+ */
+
+class LstmLayer : public Layer, public LstmCompute {
+ public:
+  explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
+
+  bool init(const LayerMap &layerMap,
+            const ParameterMap &parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback &callback) override;
+
+  void resetState() override;
+
+  void setState(LayerStatePtr state) override;
+
+  LayerStatePtr getState() override;
+
+ protected:
+  /**
+   * @brief Compute lstm forward one sequence by one sequence.
+   * @param batchSize The batchSize is not equal to the batch_size in
+   * the config file. It is the total word count of all samples
+   * in this forward batch.
+   * @param numSequences The sample number. It is equal to the batch_size
+   * in the config file.
+   * @param starts The start position of each sample.
+   * @param inputValue The input values.
+   */
+  void forwardSequence(int batchSize,
+                       size_t numSequences,
+                       const int *starts,
+                       MatrixPtr inputValue);
+  /**
+   * Compute lstm backward one sequence by one sequence.
+   */
+  void backwardSequence(int batchSize,
+                        size_t numSequences,
+                        const int *starts,
+                        MatrixPtr inputGrad);
+
+  /**
+   * Compute lstm forward one batch by one batch. The batch value is
+   * reorganized by the SequenceToBatch class. The batch output value is
+   * converted back into sequence values after the forward pass finishes.
+   * Here, one batch contains one word of each sample. If the sample
+   * lengths are not equal, the batch does not pad zeros and simply
+   * contains fewer words.
+   * The total number of batches equals the max sequence length. See the
+   * SequenceToBatch class for details. In GPU mode, a GPU kernel is
+   * launched for each iteration of the loop.
+   *
+   * @code
+   * for (int i = 0; i < numBatch(max_sequence_length); ++i) {
+   *   compute one batch.
+   * }
+   * @endcode
+   */
+  void forwardBatch(int batchSize,
+                    size_t numSequences,
+                    const int *starts,
+                    MatrixPtr inputValue);
+  /**
+   * Compute lstm backward one batch by one batch.
+   */
+  void backwardBatch(int batchSize,
+                     size_t numSequences,
+                     const int *starts,
+                     MatrixPtr inputGrad);
+
+  /**
+   * This function only supports GPU. It does not need to reorganize the
+   * input into batch values. It launches one kernel that computes the
+   * forward propagation in parallel at the sequence level.
+   */
+  void forwardSeqParallel(int batchSize,
+                          size_t numSequences,
+                          const int *starts,
+                          MatrixPtr inputValue);
+  /**
+   * Backward propagation corresponding to forwardSeqParallel.
+   */
+  void backwardSeqParallel(int batchSize,
+                           size_t numSequences,
+                           const int *starts,
+                           MatrixPtr inputGrad);
+  /**
+   * This function is used for sequence generation and gets the output
+   * after forwardBatch.
+   */
+  void getPrevBatchOutput(size_t numSequences);
+  /**
+   * This function is used for sequence generation and gets the state
+   * after forwardBatch.
+   */
+  void getPrevBatchState(size_t numSequences);
+
+ protected:
+  /// Learned parameters, shape: (size, 4*size).
+  /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
+  std::unique_ptr<Weight> weight_;
+  /// Learned bias parameter, shape: (1, 7 * size).
+  /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf},
+  /// W_{co}\f$.
+  std::unique_ptr<Weight> bias_;
+  /// The real bias, pointing to \f$b_i, b_f, b_c, b_o\f$.
+  MatrixPtr localBias_;
+  /// The peephole connection for the input gate.
+  MatrixPtr checkIg_;
+  /// The peephole connection for the forget gate.
+  MatrixPtr checkFg_;
+  /// The peephole connection for the output gate.
+  MatrixPtr checkOg_;
+  /// The gradient of the real bias.
+  MatrixPtr localBiasGrad_;
+  /// The gradient of the peephole connection for the input gates.
+  MatrixPtr checkIgGrad_;
+  /// The gradient of the peephole connection for the forget gates.
+  MatrixPtr checkFgGrad_;
+  /// The gradient of the peephole connection for the output gates.
+  MatrixPtr checkOgGrad_;
+
+  /// Stores the cell state of the previous time step, namely \f$c_{t-1}\f$.
+  Argument state_;
+  /// Stores the hidden state of the previous time step, namely \f$h_{t-1}\f$.
+  Argument preOutput_;
+  /// Stores the value and gradient of the four gates, namely
+  /// \f$i_t, f_t, o_t, c_t\f$.
+  Argument gate_;
+  /// Whether it is a reversed lstm.
+  bool reversed_;
+  /// Whether to use the batch method to compute.
+  bool useBatch_;
+  /// Whether to use the sequence-parallel method to compute.
+  bool useSeqParallel_;
+  /// batchValue_ is used in the batch calculation method. It stores the
+  /// batch value of the reorganized input.
+  std::unique_ptr<SequenceToBatch> batchValue_;
+  /// The gradient of batchValue_.
+  std::unique_ptr<SequenceToBatch> batchGrad_;
+
+  /// Used in generation; stores the state of the previous time step.
+  MatrixPtr prevState_;
+  /// Used in generation; stores the output of the previous time step.
+  MatrixPtr prevOutput_;
+  MatrixPtr prevBatchOutput2_;
+  /// The total state.
+  MatrixPtr totalState_;
+};
+
+}  // namespace paddle
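The batch-by-batch scheme documented above is easiest to see with concrete lengths. The following standalone sketch is illustrative only -- it is not the SequenceToBatch class -- and shows how three sequences of lengths 4, 3 and 1 regroup into per-time-step batches whose sizes shrink as sequences end, with no zero padding:

```cpp
// Illustrative sketch only -- not the actual SequenceToBatch implementation.
// Batch n holds the n-th word of every sequence that is still active.
#include <iostream>
#include <vector>

int main() {
  // Three sequences with lengths 4, 3 and 1 (already sorted by length).
  std::vector<int> lengths = {4, 3, 1};
  int numBatches = 4;  // max sequence length
  for (int n = 0; n < numBatches; ++n) {
    int active = 0;
    for (int len : lengths) {
      if (len > n) ++active;  // sequence still has a word at step n
    }
    // Batch n contains `active` words; no zero padding is added.
    std::cout << "batch " << n << ": " << active << " words\n";
  }
  return 0;
}
```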
diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/legacy/gserver/layers/LstmStepLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/LstmStepLayer.cpp
rename to paddle/legacy/gserver/layers/LstmStepLayer.cpp
diff --git a/paddle/legacy/gserver/layers/MDLstmLayer.cpp b/paddle/legacy/gserver/layers/MDLstmLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4838183e8ccb213aa249fddf5102026198e98d3c
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MDLstmLayer.cpp
@@ -0,0 +1,769 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "LstmLayer.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+class CoordIterator {
+ public:
+  std::vector<int> dims_;
+  std::vector<bool> directions_;
+  std::vector<int> curPos_;
+  bool end_;
+
+  void step(size_t d, bool reversed) {
+    if (directions_[d] ^ reversed) {
+      if (curPos_[d] == dims_[d] - 1) {
+        curPos_[d] = 0;
+        if (d) {
+          step(d - 1, reversed);
+        } else {
+          end_ = true;
+        }
+      } else {
+        curPos_[d]++;
+      }
+    } else {
+      if (curPos_[d] == 0) {
+        curPos_[d] = dims_[d] - 1;
+        if (d) {
+          step(d - 1, reversed);
+        } else {
+          end_ = true;
+        }
+      } else {
+        curPos_[d]--;
+      }
+    }
+  }
+
+ public:
+  CoordIterator(std::vector<int> dim, std::vector<bool> directions)
+      : dims_(dim), directions_(directions), end_(false) {
+    CHECK_EQ(dims_.size(), directions_.size());
+    for (size_t i = 0; i < dims_.size(); i++) {
+      curPos_.push_back(-1);
+    }
+  }
+  CoordIterator& operator++() {
+    step(dims_.size() - 1, false);
+    return *this;
+  }
+
+  CoordIterator& operator--() {
+    step(dims_.size() - 1, true);
+    return *this;
+  }
+
+  std::vector<int>& curPos() { return curPos_; }
+
+  int offset() {
+    int offset = curPos_[0];
+    for (size_t i = 1; i < dims_.size(); i++) {
+      offset = offset * dims_[i] + curPos_[i];
+    }
+    return offset;
+  }
+
+  int offset(const std::vector<int>& pos) {
+    int offset = pos[0];
+    for (size_t i = 1; i < dims_.size(); i++) {
+      offset = offset * dims_[i] + pos[i];
+    }
+    return offset;
+  }
+
+  std::vector<int>& begin() {
+    for (size_t i = 0; i < dims_.size(); i++) {
+      curPos_[i] = directions_[i] ? 0 : dims_[i] - 1;
+    }
+    end_ = false;
+    return curPos_;
+  }
+
+  std::vector<int>& rbegin() {
+    for (size_t i = 0; i < dims_.size(); i++) {
+      curPos_[i] = directions_[i] ? dims_[i] - 1 : 0;
+    }
+    end_ = false;
+    return curPos_;
+  }
+
+  bool end() { return end_; }
+
+  bool getPrePos(const std::vector<int>& delays,
+                 int idx,
+                 std::vector<int>& prePos) {
+    bool isAvail = true;
+    prePos.clear();
+    prePos.reserve(directions_.size());
+    for (size_t i = 0; i < directions_.size(); i++) {
+      if (int(i) == idx) {
+        prePos.push_back(curPos_[i] + delays[i] * (directions_[i] ? 1 : -1));
+        if (prePos[i] < 0) {
+          prePos[i] = 0;
+          isAvail = false;
+        }
+        if (prePos[i] >= dims_[i]) {
+          prePos[i] = dims_[i] - 1;
+          isAvail = false;
+        }
+      } else {
+        prePos.push_back(curPos_[i]);
+      }
+    }
+    return isAvail;
+  }
+
+  bool getNextPos(const std::vector<int>& delays,
+                  int idx,
+                  std::vector<int>& nextPos) {
+    bool isAvail = true;
+    nextPos.clear();
+    nextPos.reserve(directions_.size());
+    for (size_t i = 0; i < directions_.size(); i++) {
+      if (int(i) == idx) {
+        nextPos.push_back(curPos_[i] - delays[i] * (directions_[i] ? 1 : -1));
+        if (nextPos[i] < 0) {
+          nextPos[i] = 0;
+          isAvail = false;
+        }
+        if (nextPos[i] >= dims_[i]) {
+          nextPos[i] = dims_[i] - 1;
+          isAvail = false;
+        }
+      } else {
+        nextPos.push_back(curPos_[i]);
+      }
+    }
+    return isAvail;
+  }
+};
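To make the traversal concrete, here is a small standalone sketch (illustration only, not part of the layer) that reproduces the scan CoordIterator performs on a 2 x 3 grid: offset() linearizes the coordinate row-major, and a false entry in directions_ reverses the walk along that dimension:

```cpp
// Illustration of CoordIterator's scan order on a 2 x 3 grid; standalone.
#include <cstdio>

int main() {
  const int dims[2] = {2, 3};
  const bool directions[2] = {true, false};  // forward in y, backward in x
  for (int a = 0; a < dims[0]; ++a) {
    for (int b = 0; b < dims[1]; ++b) {
      int y = directions[0] ? a : dims[0] - 1 - a;
      int x = directions[1] ? b : dims[1] - 1 - b;
      // offset() linearizes the position row-major: y * dims[1] + x
      std::printf("(%d, %d) -> offset %d\n", y, x, y * dims[1] + x);
    }
  }
  return 0;
}
```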
+/*
+ * MDLstmLayer takes 1 input layer with size * (3+numDims).
+ * For each sequence [start, end] it performs the following computation:
+ * out_i = actState(state_i) * actGate(outputGate_i)
+ *
+ * For example, for an image with 2 dims, we take the scanning order from
+ * top-left to bottom-right; the 2 previous states of the current pixel are
+ * the ones located to its left and top, and each of them has an independent
+ * forget gate.
+ *
+ * state_i = actInput(input_i) * actGate(inputGate_i) +
+ *           \sum{j}(actGate(forgetGate_i_j) * state_prev_i_j)
+ *
+ * inputGate = input_i * inputW + \sum{j}(output_prev_i_j * recurrInputW_j) +
+ *             \sum{j}(state_prev_i_j * inputCheck_j)
+ *
+ * outputGate = input_i * outputW + \sum{j}(output_prev_i_j * recurrOutputW_j) +
+ *              state_i * outputCheck
+ *
+ * forgetGate_j = input_i * forgetW_j + \sum{j}(output_prev_i_j *
+ *                recurrForgetW_j) + \sum{j}(state_prev_i_j * forgetCheck_j)
+ *
+ * IG Layer: (Input, InputGate, ForgetGates, OutputGate) * OutputSize
+ * */
+
+class MDLstmLayer : public LstmLayer {
+ public:
+  explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+ protected:
+  void forwardOneSequence(int start, CoordIterator& coordIter);
+  void backwardOneSequence(int start, CoordIterator& coordIter);
+  void forwardGate2OutputSequence(int start, CoordIterator& coordIter);
+  void backwardGate2OutputSequence(int start, CoordIterator& coordIter);
+
+ protected:
+  std::vector<Argument> frameInputGate_;
+  std::vector<Argument> frameForgetGate_;
+  std::vector<Argument> frameOutputGate_;
+  std::vector<Argument> frameInputNode_;
+  std::vector<Argument> frameGate_;
+  std::vector<Argument> frameState_;
+  std::vector<Argument> framePreOutput_;
+  std::vector<Argument> frameOutput_;
+
+  // Activation
+  std::unique_ptr<ActivationFunction> activationGate_;
+  std::unique_ptr<ActivationFunction> activationState_;
+
+  int numDims_;
+  size_t numBlocks_;
+  std::vector<bool> directions_;
+  std::vector<int> delays_;
+  std::vector<std::vector<int>> dimsV_;
+};
+
+REGISTER_LAYER(mdlstmemory, MDLstmLayer);
+
+bool MDLstmLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  CHECK_EQ(1U, inputLayers_.size());
+  CHECK_EQ(1U, parameters_.size());
+
+  numBlocks_ = getSize();
+  numDims_ = config_.directions_size();
+  CHECK_EQ(numBlocks_ * numBlocks_ * (3 + numDims_), parameters_[0]->getSize());
+
+  // inode(1), ig(1), fg(numDims_), og(1), peepIg(1), peepFg(numDims_),
+  // peepOg(1), so the size of localBias_ is 3 + numDims_
+  CHECK_EQ(numBlocks_ * (5 + 2 * numDims_), biasParameter_->getSize());
+  weight_.reset(
+      new Weight(numBlocks_, numBlocks_ * (3 + numDims_), parameters_[0]));
+  if (biasParameter_.get() != NULL) {
+    bias_.reset(new Weight(1, numBlocks_ * (5 + 2 * numDims_), biasParameter_));
+    localBias_ = Matrix::create(nullptr,
+                                /* height= */ 1,
+                                numBlocks_ * (3 + numDims_),
+                                /* trans= */ false,
+                                useGpu_);
+    checkIg_ = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    checkFg_ = Matrix::create(nullptr,
+                              /* height= */ numDims_,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    checkOg_ = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    localBiasGrad_ = Matrix::create(nullptr,
+                                    /* height= */ 1,
+                                    numBlocks_ * (3 + numDims_),
+                                    /* trans= */ false,
+                                    useGpu_);
+    checkIgGrad_ = Matrix::create(nullptr,
+                                  /* height= */ 1,
+                                  numBlocks_,
+                                  /* trans= */ false,
+                                  useGpu_);
+    checkFgGrad_ = Matrix::create(nullptr,
+                                  /* height= */ numDims_,
+                                  numBlocks_,
+                                  /* trans= */ false,
+                                  useGpu_);
+    checkOgGrad_ = Matrix::create(nullptr,
+                                  /* height= */ 1,
+                                  numBlocks_,
+                                  /* trans= */ false,
+                                  useGpu_);
+
+    localBias_->setData(bias_->getW()->getData());
+    checkIg_->setData(bias_->getW()->getData() + numBlocks_ * (3 + numDims_));
+    checkFg_->setData(bias_->getW()->getData() + numBlocks_ * (4 + numDims_));
+    checkOg_->setData(bias_->getW()->getData() +
+                      numBlocks_ * (4 + 2 * numDims_));
+
+    if (bias_->getWGrad()) {
+      localBiasGrad_->setData(bias_->getWGrad()->getData());
+      checkIgGrad_->setData(bias_->getWGrad()->getData() +
+                            numBlocks_ * (3 + numDims_));
+      checkFgGrad_->setData(bias_->getWGrad()->getData() +
+                            numBlocks_ * (4 + numDims_));
+      checkOgGrad_->setData(bias_->getWGrad()->getData() +
+                            numBlocks_ * (4 + 2 * numDims_));
+    }
+  } else {
+    LOG(FATAL) << "Bias should be here.";
+  }
+  for (int i = 0; i < numDims_; i++) {
+    directions_.push_back(config_.directions(i));
+  }
+  for (int i = 0; i < numDims_; i++) {
+    delays_.push_back(-1);
+  }
+  activationGate_.reset(ActivationFunction::create(config_.active_gate_type()));
+  activationState_.reset(
+      ActivationFunction::create(config_.active_state_type()));
+
+  return true;
+}
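The setData() offsets in init() above pack the bias parameter as one flat vector. This standalone sketch just prints that layout for illustrative sizes (numBlocks = 8, numDims = 2 are assumptions chosen for the example):

```cpp
// Sketch of the bias-parameter layout used by MDLstmLayer::init() above.
#include <cstdio>

int main() {
  int numBlocks = 8, numDims = 2;
  // (inode, ig, fg * numDims, og) come first ...
  int localBiasLen = numBlocks * (3 + numDims);
  // ... followed by the peephole weights: ig check, fg checks, og check.
  int checkIgOff = numBlocks * (3 + numDims);
  int checkFgOff = numBlocks * (4 + numDims);
  int checkOgOff = numBlocks * (4 + 2 * numDims);
  int total = numBlocks * (5 + 2 * numDims);
  std::printf("localBias: [0, %d)\n", localBiasLen);
  std::printf("checkIg:   [%d, %d)\n", checkIgOff, checkFgOff);
  std::printf("checkFg:   [%d, %d)\n", checkFgOff, checkOgOff);
  std::printf("checkOg:   [%d, %d)\n", checkOgOff, total);
  return 0;
}
```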
+void MDLstmLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  CHECK(input.sequenceStartPositions);
+  int batchSize = input.getBatchSize();
+  int numSequences = input.getNumSequences();
+  resetOutput(batchSize, numBlocks_);
+  CHECK_EQ(numBlocks_ * (3 + numDims_), input.value->getWidth());
+  const int* starts = input.sequenceStartPositions->getData(false);
+  CHECK_EQ(starts[numSequences], batchSize);
+
+  int* dimsData = input.cpuSequenceDims->getData();
+  CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_ * numSequences);
+
+  for (int i = 0; i < numSequences; i++) {
+    std::vector<int> dims;
+    for (int j = 0; j < numDims_; j++) {
+      dims.push_back(dimsData[i * numDims_ + j]);
+    }
+    dimsV_.push_back(dims);
+  }
+
+  frameInputGate_.reserve(batchSize);
+  frameForgetGate_.reserve(batchSize);
+  frameOutputGate_.reserve(batchSize);
+  frameInputNode_.reserve(batchSize);
+  frameGate_.reserve(batchSize);
+  frameState_.reserve(batchSize);
+  framePreOutput_.reserve(batchSize);
+  frameOutput_.reserve(batchSize);
+
+  Matrix::resizeOrCreate(gate_.value,
+                         /* height= */ batchSize,
+                         numBlocks_ * (3 + numDims_),
+                         /* trans= */ false,
+                         useGpu_);
+
+  for (int i = frameGate_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ 1,
+                               numBlocks_ * (3 + numDims_),
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_ * (3 + numDims_),
+                              /* trans= */ false,
+                              useGpu_);
+    frameGate_.push_back(arg);
+  }
+  for (int i = frameInputGate_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ 1,
+                               numBlocks_,
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    frameInputGate_.push_back(arg);
+  }
+  for (int i = frameForgetGate_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ numDims_,
+                               numBlocks_,
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ numDims_,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    frameForgetGate_.push_back(arg);
+  }
+  for (int i = frameOutputGate_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ 1,
+                               numBlocks_,
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    frameOutputGate_.push_back(arg);
+  }
+  for (int i = frameInputNode_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ 1,
+                               numBlocks_,
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    frameInputNode_.push_back(arg);
+  }
+  for (int i = frameState_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(
+        /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
+    frameState_.push_back(arg);
+  }
+  for (int i = framePreOutput_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(
+        /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
+    framePreOutput_.push_back(arg);
+  }
+  for (int i = frameOutput_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ 1,
+                               numBlocks_,
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    frameOutput_.push_back(arg);
+  }
+
+  for (int i = 0; i < batchSize; i++) {
+    frameOutput_[i].value->setData(output_.value->getData() + i * numBlocks_);
+    frameGate_[i].value->setData(gate_.value->getData() +
+                                 i * numBlocks_ * (3 + numDims_));
+    frameInputNode_[i].value->setData(gate_.value->getData() +
+                                      i * numBlocks_ * (3 + numDims_) +
+                                      numBlocks_ * 0);
+    frameInputGate_[i].value->setData(gate_.value->getData() +
+                                      i * numBlocks_ * (3 + numDims_) +
+                                      numBlocks_ * 1);
+    frameForgetGate_[i].value->setData(gate_.value->getData() +
+                                       i * numBlocks_ * (3 + numDims_) +
+                                       numBlocks_ * 2);
+    frameOutputGate_[i].value->setData(gate_.value->getData() +
+                                       i * numBlocks_ * (3 + numDims_) +
+                                       numBlocks_ * (2 + numDims_));
+  }
+
+  AsyncGpuBlock asyncGpuBlock;
+  gate_.value->assign(*input.value);
+
+  if (bias_) {
+    gate_.value->addBias(*localBias_, 1);
+  }
+
+  for (int i = 0; i < numSequences; i++) {
+    CoordIterator coordIter(dimsV_[i], directions_);
+    forwardOneSequence(starts[i], coordIter);
+  }
+}
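The per-frame matrices in forward() above are views, not copies: they are created with a null data pointer and later pointed into the shared gate buffer via setData(). The following standalone sketch shows the same aliasing with plain pointers (illustration only):

```cpp
// Sketch of the view trick used in forward() above: per-frame "matrices"
// alias slices of one shared buffer, so no per-frame memory is allocated.
#include <vector>

int main() {
  const int batchSize = 4, numBlocks = 8, numDims = 2;
  std::vector<float> gateValue(batchSize * numBlocks * (3 + numDims));
  std::vector<float*> frameInputNode(batchSize), frameInputGate(batchSize);
  for (int i = 0; i < batchSize; ++i) {
    float* row = gateValue.data() + i * numBlocks * (3 + numDims);
    frameInputNode[i] = row + numBlocks * 0;  // input node slice
    frameInputGate[i] = row + numBlocks * 1;  // input gate slice
  }
  // Writing through a frame pointer mutates the shared gate buffer.
  frameInputGate[2][0] = 1.0f;
  bool aliased =
      frameInputGate[2][0] == gateValue[2 * numBlocks * (3 + numDims) + numBlocks];
  return aliased ? 0 : 1;
}
```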
+void MDLstmLayer::forwardGate2OutputSequence(int start,
+                                             CoordIterator& coordIter) {
+  int idxCurr = start + coordIter.offset();
+  std::vector<int> preOffsetV;
+  preOffsetV.resize(numDims_);
+  for (int i = 0; i < numDims_; i++) {
+    std::vector<int> prePos;
+    if (coordIter.getPrePos(delays_, i, prePos)) {
+      preOffsetV[i] = coordIter.offset(prePos);
+    } else {
+      preOffsetV[i] = -1;
+    }
+  }
+
+  for (int i = 0; i < numDims_; i++) {
+    if (preOffsetV[i] >= 0) {
+      frameInputGate_[idxCurr].value->addDotMul(
+          *frameState_[start + preOffsetV[i]].value, *checkIg_, 1.0, 1.0);
+
+      MatrixPtr fgGateOneDim = Matrix::create(
+          frameForgetGate_[idxCurr].value->getData() + i * numBlocks_,
+          1,
+          numBlocks_,
+          false,
+          useGpu_);
+      MatrixPtr checkFgOneDim =
+          Matrix::create(checkFg_->getData() + i * numBlocks_,
+                         1,
+                         numBlocks_,
+                         false,
+                         useGpu_);
+      fgGateOneDim->addDotMul(
+          *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0);
+    }
+  }
+  auto status = activationGate_->forward(frameInputGate_[idxCurr]);
+  status.check();
+  status = activationGate_->forward(frameForgetGate_[idxCurr]);
+  status.check();
+  status = activation_->forward(frameInputNode_[idxCurr]);
+  status.check();
+
+  frameState_[idxCurr].value->zeroMem();
+  for (int i = 0; i < numDims_; i++) {
+    if (preOffsetV[i] >= 0) {
+      MatrixPtr fgGateOneDim = Matrix::create(
+          frameForgetGate_[idxCurr].value->getData() + i * numBlocks_,
+          1,
+          numBlocks_,
+          false,
+          useGpu_);
+      frameState_[idxCurr].value->addDotMul(
+          *frameState_[start + preOffsetV[i]].value, *fgGateOneDim, 1.0, 1.0);
+    }
+  }
+  frameState_[idxCurr].value->addDotMul(*frameInputNode_[idxCurr].value,
+                                        *frameInputGate_[idxCurr].value,
+                                        1.0,
+                                        1.0);
+
+  frameOutputGate_[idxCurr].value->addDotMul(
+      *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0);
+  status = activationGate_->forward(frameOutputGate_[idxCurr]);
+  status.check();
+
+  framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value));
+  status = activationState_->forward(framePreOutput_[idxCurr]);
+  status.check();
+
+  frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value,
+                                      *frameOutputGate_[idxCurr].value);
+}
+
+void MDLstmLayer::forwardOneSequence(int start, CoordIterator& coordIter) {
+  for (coordIter.begin(); !coordIter.end(); ++coordIter) {
+    int offset = coordIter.offset();
+    for (int i = 0; i < numDims_; i++) {
+      std::vector<int> prePos;
+      if (coordIter.getPrePos(delays_, i, prePos)) {
+        int preOffset = coordIter.offset(prePos);
+        frameGate_[start + offset].value->mul(
+            *frameOutput_[start + preOffset].value, *weight_->getW(), 1.0, 1.0);
+      }
+    }
+    forwardGate2OutputSequence(start, coordIter);
+  }
+}
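For readers tracing forwardGate2OutputSequence() above, here is the same per-cell update written out on scalars for numDims = 2. It is a sketch only: sigmoid and tanh stand in for whatever activations the layer config actually selects.

```cpp
// Scalar sketch of one MD-LSTM cell update (numDims = 2); illustration only.
#include <cmath>
#include <cstdio>

static double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

int main() {
  const int numDims = 2;
  double prevState[numDims] = {0.3, -0.1};  // states of the two predecessors
  double inputNode = 0.5, inputGate = 0.2, outputGate = 0.4;
  double forgetGate[numDims] = {0.1, 0.7};  // one forget gate per dimension
  double checkIg = 0.05, checkFg[numDims] = {0.02, 0.03}, checkOg = 0.01;

  // Peephole contributions, then gate activations.
  for (int d = 0; d < numDims; ++d) {
    inputGate += prevState[d] * checkIg;
    forgetGate[d] += prevState[d] * checkFg[d];
  }
  inputGate = sigmoid(inputGate);
  inputNode = std::tanh(inputNode);

  // state_i = actInput(input_i) * actGate(ig) + sum_d fg_d * prevState_d
  double state = inputNode * inputGate;
  for (int d = 0; d < numDims; ++d) {
    state += sigmoid(forgetGate[d]) * prevState[d];
  }

  // out_i = actState(state_i) * actGate(og), with og peeping at state_i.
  outputGate = sigmoid(outputGate + state * checkOg);
  std::printf("output = %f\n", std::tanh(state) * outputGate);
  return 0;
}
```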
+void MDLstmLayer::backward(const UpdateCallback& callback) {
+  const Argument& input = getInput(0);
+  CHECK(input.sequenceStartPositions);
+  int batchSize = input.getBatchSize();
+  const int* starts = input.sequenceStartPositions->getData(false);
+  size_t numSequences = input.getNumSequences();
+
+  Matrix::resizeOrCreate(gate_.grad,
+                         /* height= */ batchSize,
+                         numBlocks_ * (3 + numDims_),
+                         /* trans= */ false,
+                         useGpu_);
+
+  for (int i = 0; i < batchSize; i++) {
+    if (frameState_[i].grad == NULL)
+      frameState_[i].grad = Matrix::create(
+          /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
+  }
+  for (int i = 0; i < batchSize; i++) {
+    if (framePreOutput_[i].grad == NULL)
+      framePreOutput_[i].grad = Matrix::create(
+          /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
+  }
+
+  for (int i = 0; i < batchSize; i++) {
+    frameOutput_[i].grad->setData(output_.grad->getData() + i * numBlocks_);
+    frameGate_[i].grad->setData(gate_.grad->getData() +
+                                i * numBlocks_ * (3 + numDims_));
+    frameInputNode_[i].grad->setData(gate_.grad->getData() +
+                                     i * numBlocks_ * (3 + numDims_) +
+                                     numBlocks_ * 0);
+    frameInputGate_[i].grad->setData(gate_.grad->getData() +
+                                     i * numBlocks_ * (3 + numDims_) +
+                                     numBlocks_ * 1);
+    frameForgetGate_[i].grad->setData(gate_.grad->getData() +
+                                      i * numBlocks_ * (3 + numDims_) +
+                                      numBlocks_ * 2);
+    frameOutputGate_[i].grad->setData(gate_.grad->getData() +
+                                      i * numBlocks_ * (3 + numDims_) +
+                                      numBlocks_ * (2 + numDims_));
+  }
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+
+    for (size_t i = 0; i < numSequences; i++) {
+      CoordIterator coordIter(dimsV_[i], directions_);
+      backwardOneSequence(starts[i], coordIter);
+    }
+  }
+
+  if (input.grad) {
+    input.grad->add(*gate_.grad);
+  }
+  if (bias_ && bias_->getWGrad()) {
+    localBiasGrad_->collectBias(*gate_.grad, 1);
+    bias_->getParameterPtr()->incUpdate(callback);
+  }
+
+  weight_->getParameterPtr()->incUpdate(callback);
+}
+void MDLstmLayer::backwardGate2OutputSequence(int start,
+                                              CoordIterator& coordIter) {
+  int idxCurr = start + coordIter.offset();
+  std::vector<int> preOffsetV;
+  std::vector<int> nextOffsetV;
+  preOffsetV.resize(numDims_);
+  nextOffsetV.resize(numDims_);
+  for (int i = 0; i < numDims_; i++) {
+    std::vector<int> prePos;
+    if (coordIter.getPrePos(delays_, i, prePos)) {
+      preOffsetV[i] = coordIter.offset(prePos);
+    } else {
+      preOffsetV[i] = -1;
+    }
+    std::vector<int> nextPos;
+    if (coordIter.getNextPos(delays_, i, nextPos)) {
+      nextOffsetV[i] = coordIter.offset(nextPos);
+    } else {
+      nextOffsetV[i] = -1;
+    }
+  }
+
+  framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
+                                        *frameOutputGate_[idxCurr].value);
+  activationState_->backward(framePreOutput_[idxCurr]).check();
+  frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad));
+
+  frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
+                                         *framePreOutput_[idxCurr].value);
+  activationGate_->backward(frameOutputGate_[idxCurr]).check();
+
+  frameState_[idxCurr].grad->addDotMul(
+      *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0);
+  for (int i = 0; i < numDims_; i++) {
+    if (nextOffsetV[i] >= 0) {
+      frameState_[idxCurr].grad->addDotMul(
+          *frameInputGate_[start + nextOffsetV[i]].grad, *checkIg_, 1.0, 1.0);
+
+      MatrixPtr fgGateOneDimGrad = Matrix::create(
+          frameForgetGate_[start + nextOffsetV[i]].grad->getData() +
+              i * numBlocks_,
+          1,
+          numBlocks_,
+          false,
+          useGpu_);
+      MatrixPtr fgGateOneDimVal = Matrix::create(
+          frameForgetGate_[start + nextOffsetV[i]].value->getData() +
+              i * numBlocks_,
+          1,
+          numBlocks_,
+          false,
+          useGpu_);
+      MatrixPtr checkFgOneDim = Matrix::create(
+          checkFg_->getData() + i * numBlocks_, 1, numBlocks_, false, useGpu_);
+
+      frameState_[idxCurr].grad->addDotMul(
+          *fgGateOneDimGrad, *checkFgOneDim, 1.0, 1.0);
+      frameState_[idxCurr].grad->addDotMul(
+          *frameState_[start + nextOffsetV[i]].grad,
+          *fgGateOneDimVal,
+          1.0,
+          1.0);
+    }
+  }
+
+  frameInputNode_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad,
+                                        *frameInputGate_[idxCurr].value);
+  frameInputGate_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad,
+                                        *frameInputNode_[idxCurr].value);
+
+  frameForgetGate_[idxCurr].grad->zeroMem();
+  for (int i = 0; i < numDims_; i++) {
+    if (preOffsetV[i] >= 0) {
+      MatrixPtr fgGateOneDimGrad = Matrix::create(
+          frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_,
+          1,
+          numBlocks_,
+          false,
+          useGpu_);
+      fgGateOneDimGrad->addDotMul(*frameState_[idxCurr].grad,
+                                  *frameState_[start + preOffsetV[i]].value,
+                                  1.0,
+                                  1.0);
+    }
+  }
+
+  activationGate_->backward(frameInputGate_[idxCurr]).check();
+  activationGate_->backward(frameForgetGate_[idxCurr]).check();
+  activation_->backward(frameInputNode_[idxCurr]).check();
+
+  if (bias_->getWGrad()) {
+    for (int i = 0; i < numDims_; i++) {
+      if (preOffsetV[i] >= 0) {
+        checkIgGrad_->addDotMul(*frameInputGate_[idxCurr].grad,
+                                *frameState_[start + preOffsetV[i]].value,
+                                1.0,
+                                1.0);
+
+        MatrixPtr fgGateOneDimGrad = Matrix::create(
+            frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_,
+            1,
+            numBlocks_,
+            false,
+            useGpu_);
+        MatrixPtr checkFgOneDimGrad =
+            Matrix::create(checkFgGrad_->getData() + i * numBlocks_,
+                           1,
+                           numBlocks_,
+                           false,
+                           useGpu_);
+        checkFgOneDimGrad->addDotMul(*fgGateOneDimGrad,
+                                     *frameState_[start + preOffsetV[i]].value,
+                                     1.0,
+                                     1.0);
+      }
+    }
+    checkOgGrad_->addDotMul(
+        *frameOutputGate_[idxCurr].grad, *frameState_[idxCurr].value, 1.0, 1.0);
+  }
+}
+
+void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) {
+  MatrixPtr weightT = weight_->getW()->getTranspose();
+  for (coordIter.rbegin(); !coordIter.end(); --coordIter) {
+    int offset = coordIter.offset();
+    backwardGate2OutputSequence(start, coordIter);
+    for (int i = 0; i < numDims_; i++) {
+      std::vector<int> prePos;
+      if (coordIter.getPrePos(delays_, i, prePos)) {
+        int preOffset = coordIter.offset(prePos);
+        frameOutput_[start + preOffset].grad->mul(
+            *frameGate_[start + offset].grad, *weightT, 1.0, 1.0);
+        if (weight_->getWGrad()) {
+          weight_->getWGrad()->mul(
+              *frameOutput_[start + preOffset].value->getTranspose(),
+              *frameGate_[start + offset].grad,
+              1.0,
+              1.0);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLDNNAddtoLayer.cpp
rename to paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h
similarity index 100%
rename from paddle/gserver/layers/MKLDNNAddtoLayer.h
rename to paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h
diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/legacy/gserver/layers/MKLDNNBase.h
similarity index 100%
rename from paddle/gserver/layers/MKLDNNBase.h
rename to paddle/legacy/gserver/layers/MKLDNNBase.h
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
rename to paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h
similarity index 100%
rename from paddle/gserver/layers/MKLDNNBatchNormLayer.h
rename to paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLDNNConcatLayer.cpp
rename to paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h
similarity index 100%
rename from paddle/gserver/layers/MKLDNNConcatLayer.h
rename to paddle/legacy/gserver/layers/MKLDNNConcatLayer.h
diff --git a/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..01c20d240b5a1f4aab642d6be00689716a015576
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp
@@ -0,0 +1,388 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNConvLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer);
+
+bool MKLDNNConvLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK(config_.shared_biases()) << "Only support shared biases yet";
+
+  oc_ = config_.num_filters();
+  const ConvConfig& conf = config_.inputs(0).conv_conf();
+  ic_ = conf.channels();
+  fw_ = conf.filter_size();
+  fh_ = conf.filter_size_y();
+  pw_ = conf.padding();
+  ph_ = conf.padding_y();
+  dw_ = conf.dilation();
+  dh_ = conf.dilation_y();
+  sw_ = conf.stride();
+  sh_ = conf.stride_y();
+  gp_ = conf.groups();
+  oh_ = conf.output_y();
+  ow_ = conf.output_x();
+  ih_ = conf.img_size_y();
+  iw_ = conf.img_size();
+  caffeMode_ = conf.caffe_mode();
+  CHECK(caffeMode_) << "Only support caffe mode yet";
+  CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet";
+  // check group setting
+  CHECK_EQ((oc_ / gp_) * gp_, oc_) << "group is indivisible for oc";
+  CHECK_EQ((ic_ / gp_) * gp_, ic_) << "group is indivisible for ic";
+
+  // create weight
+  size_t height = oc_ / gp_;
+  size_t width = ic_ * fh_ * fw_;
+  CHECK_EQ(parameters_[0]->getSize(), height * width);
+  weight_ =
+      std::unique_ptr<Weight>(new Weight(height, width, parameters_[0], 0));
+
+  // create biases
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
+  }
+  return true;
+}
+
+void MKLDNNConvLayer::convertWeightsFromPaddle() {
+  if (hasInitedWgt_) {
+    return;
+  }
+
+  CHECK(wgtVal_) << "should have been initialized";
+  // the paddle weight format is oihw or goihw
+  auto targetDim = wgtVal_->getDims();
+  auto srcFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
+  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
+  hasInitedWgt_ = true;
+}
+
+void MKLDNNConvLayer::convertWeightsToPaddle() {
+  CHECK(wgtVal_) << "should have been initialized";
+  auto targetDim = wgtVal_->getDims();
+  auto dstFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
+  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
+}
+
+void MKLDNNConvLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+
+  // calculate output sizes
+  // oc can not be changed
+  int fh = (fh_ - 1) * dh_ + 1;
+  int fw = (fw_ - 1) * dw_ + 1;
+  oh = outputSize(ih, fh, ph_, sh_, caffeMode_);
+  ow = outputSize(iw, fw, pw_, sw_, caffeMode_);
+
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
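reshape() above derives the output height and width from a dilation-adjusted filter size. The standalone sketch below shows that arithmetic, assuming the usual caffe-mode convention of floor division (an assumption about what outputSize() computes, chosen for illustration):

```cpp
// Sketch of the conv output-size arithmetic used in reshape() above,
// under the usual caffe-mode convention (floor division); illustration only.
#include <cstdio>

int convOutputSize(int in, int filter, int pad, int stride) {
  return (in + 2 * pad - filter) / stride + 1;  // assumed caffe-mode formula
}

int main() {
  int ih = 224, fh = 3, ph = 1, sh = 1, dh = 2;
  // Dilation enlarges the effective filter exactly as in reshape():
  int effFh = (fh - 1) * dh + 1;  // 3 -> 5 for dilation 2
  std::printf("oh = %d\n", convOutputSize(ih, effFh, ph, sh));  // 222
  return 0;
}
```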
+void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
+                               MKLDNNMatrixPtr& out) {
+  resetFwdPD(fwdPD_);
+
+  resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
+}
+
+void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
+                               MKLDNNMatrixPtr& out) {
+  std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
+  std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
+
+  resetBwdWgtPD(bwdWgtPD);
+
+  resetBwdDataPD(bwdDataPD);
+
+  resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
+
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
+}
+
+void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt,
+                                       memory::dims& bias,
+                                       memory::dims& stride,
+                                       memory::dims& dilation,
+                                       memory::dims& padL,
+                                       memory::dims& padR) {
+  wgt = (gp_ == 1) ? memory::dims{oc_, ic_, fh_, fw_}
+                   : memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_};
+  bias = memory::dims{oc_};
+  stride = memory::dims{sh_, sw_};
+  padL = memory::dims{ph_, pw_};
+  padR = getPaddingR();
+  // note: mkldnn dilation starts from 0
+  dilation = memory::dims{dh_ - 1, dw_ - 1};
+}
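loadConvSettings() above switches the weight tensor between the oihw and goihw layouts. This standalone sketch (values are illustrative) just prints the two shapes, showing that each group convolves ic/gp input channels into oc/gp output channels:

```cpp
// Sketch of the weight-tensor dimensions chosen in loadConvSettings() above.
#include <cstdio>

int main() {
  int oc = 32, ic = 16, fh = 3, fw = 3;  // illustrative sizes
  for (int gp : {1, 4}) {
    if (gp == 1) {
      std::printf("gp=1 : oihw  = {%d, %d, %d, %d}\n", oc, ic, fh, fw);
    } else {
      // Each group convolves ic/gp input channels into oc/gp outputs.
      std::printf("gp=%d : goihw = {%d, %d, %d, %d, %d}\n",
                  gp, gp, oc / gp, ic / gp, fh, fw);
    }
  }
  return 0;
}
```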
+void MKLDNNConvLayer::resetFwdPD(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd) {
+  // dims for conv
+  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  algorithm algo = algorithm::convolution_direct;
+  padding_kind padKind = padding_kind::zero;
+  conv_fwd::desc fwdDesc =
+      biases_ && biases_->getW()
+          ? conv_fwd::desc(pk,
+                           algo,
+                           MKLDNNMatrix::createMemoryDesc(inDims),
+                           MKLDNNMatrix::createMemoryDesc(wgtDims),
+                           MKLDNNMatrix::createMemoryDesc(biasDims),
+                           MKLDNNMatrix::createMemoryDesc(outDims),
+                           strides,
+                           dilations,
+                           padL,
+                           padR,
+                           padKind)
+          : conv_fwd::desc(pk,
+                           algo,
+                           MKLDNNMatrix::createMemoryDesc(inDims),
+                           MKLDNNMatrix::createMemoryDesc(wgtDims),
+                           MKLDNNMatrix::createMemoryDesc(outDims),
+                           strides,
+                           dilations,
+                           padL,
+                           padR,
+                           padKind);
+  pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_));
+}
+
+void MKLDNNConvLayer::resetFwdBuffers(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(pd);
+  resetInValue(
+      in, std::make_shared<memory::primitive_desc>(pd->src_primitive_desc()));
+
+  resetOutValue(out, pd->dst_primitive_desc());
+
+  resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc());
+
+  if (biases_ && biases_->getW()) {
+    resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc());
+  } else {
+    bias = nullptr;
+  }
+}
+
+void MKLDNNConvLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  if (bias) {
+    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out));
+  } else {
+    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out));
+  }
+  pipeline.push_back(*fwd_);
+}
+void MKLDNNConvLayer::resetBwdWgtPD(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& pd) {
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+
+  // create backward weight using input, output and weight value memory desc
+  CHECK(inVals_[0]) << "Should have internal input value";
+  CHECK(outVal_) << "Should have internal output value";
+  CHECK(wgtVal_) << "Should have weight value";
+  algorithm algo = algorithm::convolution_direct;
+  padding_kind padKind = padding_kind::zero;
+  auto bwdWgtDesc = biasVal_ != nullptr
+                        ? conv_bwdWgt::desc(algo,
+                                            inVals_[0]->getMemoryDesc(),
+                                            wgtVal_->getMemoryDesc(),
+                                            biasVal_->getMemoryDesc(),
+                                            outVal_->getMemoryDesc(),
+                                            strides,
+                                            padL,
+                                            padR,
+                                            padKind)
+                        : conv_bwdWgt::desc(algo,
+                                            inVals_[0]->getMemoryDesc(),
+                                            wgtVal_->getMemoryDesc(),
+                                            outVal_->getMemoryDesc(),
+                                            strides,
+                                            padL,
+                                            padR,
+                                            padKind);
+  pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(
+      outVal_,
+      pd->diff_dst_primitive_desc(),
+      "primitive desc of out value and grad should be equal");
+  CHECK_PRIMITIVE_DESC_EQ(
+      wgtVal_,
+      pd->diff_weights_primitive_desc(),
+      "primitive desc of weight value and grad should be equal");
+}
+
+void MKLDNNConvLayer::resetBwdDataPD(
+    std::shared_ptr<conv_bwdData::primitive_desc>& pd) {
+  pd = nullptr;
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
+    return;
+  }
+
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+  CHECK(inVals_[0]) << "Should have internal input value";
+  CHECK(outVal_) << "Should have internal output value";
+  // create backward data using input and output value memory desc
+  // but using weight memory desc with any format
+  auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
+                                        inVals_[0]->getMemoryDesc(),
+                                        MKLDNNMatrix::createMemoryDesc(wgtDims),
+                                        outVal_->getMemoryDesc(),
+                                        strides,
+                                        padL,
+                                        padR,
+                                        padding_kind::zero);
+  pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
+  CHECK_PRIMITIVE_DESC_EQ(
+      inVals_[0],
+      pd->diff_src_primitive_desc(),
+      "primitive desc of in value and grad should be equal");
+  CHECK_PRIMITIVE_DESC_EQ(
+      outVal_,
+      pd->diff_dst_primitive_desc(),
+      "primitive desc of out value and grad should be equal");
+}
+
+void MKLDNNConvLayer::resetBwdBuffers(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(wgtPD);
+  resetOutGrad(out, wgtPD->diff_dst_primitive_desc());
+
+  resetWithMatrix(
+      wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(
+      wgtVal_,
+      wgt->getPrimitiveDesc(),
+      "primitive desc of weight grad and value should be equal");
+
+  bias = nullptr;
+  if (biases_ && biases_->getWGrad()) {
+    resetWithMatrix(
+        bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
+    CHECK(bias);
+    CHECK_PRIMITIVE_DESC_EQ(
+        biasVal_,
+        bias->getPrimitiveDesc(),
+        "primitive desc of bias grad and value should be equal");
+  }
+
+  if (dataPD == nullptr) {
+    return;
+  }
+  resetInGrad(in, dataPD->diff_src_primitive_desc());
+  resetWgtValBwdData(dataPD, wgtValBwdData_);
+}
+
+void MKLDNNConvLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0]);
+  // add bwdWgt handle
+  if (bias) {
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
+  } else {
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
+  }
+  pipeline.push_back(*bwdWgt_);
+
+  if (dataPD == nullptr) {
+    return;
+  }
+  if (cvtWgtVal_) {
+    pipeline.push_back(*cvtWgtVal_);
+  }
+  // add bwdData handle
+  CHECK(wgtValBwdData_) << "Should have weight memory";
+  bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
+  pipeline.push_back(*bwdData_);
+}
+void MKLDNNConvLayer::resetWgtValBwdData(
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& wgt) {
+  if (dataPD == nullptr) {
+    return;
+  }
+
+  // create new weight value for backward data, and create reorder if necessary
+  // since the primitive_desc may differ from wgtVal_
+  CHECK(wgtVal_) << "should have weight value";
+  if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
+    wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
+    cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
+    CHECK(cvtWgtVal_);
+  } else {
+    wgtValBwdData_ = wgtVal_;
+  }
+  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
+                    << wgtValBwdData_->getFormat();
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/legacy/gserver/layers/MKLDNNConvLayer.h
similarity index 100%
rename from paddle/gserver/layers/MKLDNNConvLayer.h
rename to paddle/legacy/gserver/layers/MKLDNNConvLayer.h
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLDNNFcLayer.cpp
rename to paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/legacy/gserver/layers/MKLDNNFcLayer.h
similarity index 100%
rename from paddle/gserver/layers/MKLDNNFcLayer.h
rename to paddle/legacy/gserver/layers/MKLDNNFcLayer.h
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLDNNLRNLayer.cpp
rename to paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h
similarity index 100%
rename from paddle/gserver/layers/MKLDNNLRNLayer.h
rename to paddle/legacy/gserver/layers/MKLDNNLRNLayer.h
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLDNNLayer.cpp
rename to paddle/legacy/gserver/layers/MKLDNNLayer.cpp
diff --git a/paddle/legacy/gserver/layers/MKLDNNLayer.h b/paddle/legacy/gserver/layers/MKLDNNLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8f292684cdb23af197c6d4dbf023321781b662b
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNLayer.h
@@ -0,0 +1,477 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "MKLDNNBase.h"
+#include "mkldnn.hpp"
+#include "paddle/legacy/math/MKLDNNMatrix.h"
+#include "paddle/utils/Stat.h"
+
+DECLARE_bool(use_mkldnn);
+
+namespace paddle {
+
+class MKLDNNLayer;
+typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
+
+/**
+ * @brief Base class of MKLDNNLayer.
+ */
+class MKLDNNLayer : public Layer {
+ protected:
+  // batch size
+  int bs_;
+  // their sizes are always from the first input layer
+  // input image channel, height and width
+  int ic_, ih_, iw_;
+  // output image channel, height and width
+  int oc_, oh_, ow_;
+
+  // the condition under which forward needs to be reset
+  size_t condition_;
+  // backward also needs a reset after resetting the forward handle
+  bool needResetBwd_;
+
+  // is output only mkldnn
+  bool outputOnlyMKLDNN_;
+
+  // mkldnn engine, stream and primitives
+  mkldnn::engine engine_;
+  std::shared_ptr<MKLDNNStream> stream_;
+  std::shared_ptr<mkldnn::primitive> fwd_;
+  std::shared_ptr<mkldnn::primitive> bwdWgt_;
+  std::shared_ptr<mkldnn::primitive> bwdData_;
+  std::vector<mkldnn::primitive> pipelineFwd_;
+  std::vector<mkldnn::primitive> pipelineBwd_;
+
+  /* Value and grad are separated as internal and external buffers.
+   * Each MKLDNNLayer must init or reset the internal buffer at least,
+   * and the external buffer format is always nchw or nc (when h == w == 1),
+   * which is the same format as paddle.
+   * The output_.value and output_.grad always save the external data
+   * when mixed with a cpu device.
+   * When all layers are mkldnn layers, they can save internal data.
+   */
+  // below MKLDNNMatrix buffers are all internal buffers
+  std::vector<MKLDNNMatrixPtr> inVals_;
+  std::vector<MKLDNNMatrixPtr> inGrads_;
+  MKLDNNMatrixPtr outVal_;
+  MKLDNNMatrixPtr outGrad_;
+  // below are external value and grad
+  std::vector<MKLDNNMatrixPtr> extInVals_;
+  std::vector<MKLDNNMatrixPtr> extInGrads_;
+  MKLDNNMatrixPtr extOutVal_;
+  MKLDNNMatrixPtr extOutGrad_;
+  // convert handles between external and internal buffers
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
+  // weight and bias are always internal buffers
+  MKLDNNMatrixPtr wgtVal_;
+  MKLDNNMatrixPtr wgtGrad_;
+  MKLDNNMatrixPtr biasVal_;
+  MKLDNNMatrixPtr biasGrad_;
+
+  // merge grad primitive
+  std::shared_ptr<mkldnn::primitive> mergeGrad_;
+  std::vector<mkldnn::primitive> pipelineMergeGrad_;
+  // tmp input argument to save input grad, only used to merge grad
+  Argument tmpInArg_;
+
+ public:
+  explicit MKLDNNLayer(const LayerConfig& config)
+      : Layer(config),
+        ih_(0),
+        iw_(0),
+        condition_(0),
+        needResetBwd_(true),
+        outputOnlyMKLDNN_(false),
+        engine_(mkldnn::engine::cpu, 0),
+        stream_(nullptr),
+        fwd_(nullptr),
+        bwdWgt_(nullptr),
+        bwdData_(nullptr) {}
+
+  ~MKLDNNLayer() {}
+
+  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  virtual void forward(PassType passType);
+  virtual void backward(const UpdateCallback& callback);
+
+  /**
+   * reshape the input and output channels and image sizes
+   * and reset the output buffer size
+   */
+  virtual void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
+
+  /**
+   * reset the mkldnn forward primitive and memories
+   * only called when the input size changes
+   * weight and bias buffers should be covered by the child class itself
+   */
+  virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out) = 0;
+
+  /**
+   * reset the mkldnn backward primitive and memories
+   * only called when needed
+   * weight and bias buffers should be covered by the child class itself
+   */
+  virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out) = 0;
+
+  /**
+   * Update weights and biases if necessary.
+   */
+  virtual void updateWeights(const UpdateCallback& callback) {}
+
+  /**
+   * convert weight from paddle format to mkldnn format
+   * weight_ will be overridden
+   */
+  virtual void convertWeightsFromPaddle() {}
+
+  /**
+   * convert mkldnn weight to paddle format
+   * weight_ will be overridden
+   */
+  virtual void convertWeightsToPaddle() {}
+
+  /**
+   * add this interface as public for unit test
+   */
+  void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
+
+ protected:
+  /**
+   * Some layers may have different conditions for resetting the forward.
+   * The function returns the condition under which forward need not be reset.
+   */
+  inline virtual size_t keepCondition() {
+    // reset when the element size of the first input changes,
+    // not only the batchsize
+    return inputLayers_[0]->getOutputValue()->getElementCnt();
+  }
+
+  /**
+   * reshape the input image sizes and input batchsize
+   */
+  void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
+
+  /**
+   * reshape output image sizes
+   */
+  void reshapeOutput(size_t height, size_t width);
+
+  /**
+   * reset MKLDNNMatrix from Matrix and internal primitive desc.
+   * reset to nullptr if the matrix or primitive desc is empty
+   */
+  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                       const MatrixPtr& mat,
+                       mkldnn::memory::primitive_desc pd);
+
+  /**
+   * reset input value from input MKLDNNMatrix and internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   * input channel may be different in concat.
+   */
+  void resetInValue(
+      MKLDNNMatrixPtr& in,
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
+      size_t idx = 0,
+      int inputChannel = 0);
+
+  /**
+   * reset output value from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetOutValue(MKLDNNMatrixPtr& out,
+                     mkldnn::memory::primitive_desc intPD);
+
+  /**
+   * reset input grad from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInGrad(MKLDNNMatrixPtr& in,
+                   mkldnn::memory::primitive_desc intPD,
+                   size_t idx = 0);
+
+  /**
+   * reset output grad from internal primitive desc.
+   * merge grad if necessary.
+   * reset both internal and external buffer and create reorder if necessary.
+   * note: about merge grad, when this layer has several outputs,
+   *       it can not be mixed with a cpu device,
+   *       since it can not get a memory desc from the cpu device.
+   */
+  void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
+
+  /**
+   * reset the merge grad primitive if necessary.
+   * note: grads mixed with a cpu device are not supported,
+   *       since it can not get a memory desc from the cpu device.
+   */
+  void resetMergeGrad(MKLDNNMatrixPtr& out);
+
+ protected:
+  /**
+   * Set deviceId of this layer.
+   */
+  void setDevice(int id) { deviceId_ = id; }
+
+  /**
+   * check whether the format is nchw or nc,
+   * which is supported by Paddle's default memory layout
+   */
+  bool isPaddleFormat(mkldnn::memory::format fmt) {
+    if (fmt == mkldnn::memory::format::nchw ||
+        fmt == mkldnn::memory::format::nc) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * Whether the input uses only the MKLDNN device.
+   * Otherwise, only a previous layer using the CPU device is supported.
+   */
+  bool inputIsOnlyMKLDNN(int index = 0) {
+    int prevDevice = getPrev(index)->getDeviceId();
+    if (prevDevice == MKLDNN_DEVICE) {
+      return true;
+    } else {
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
+      return false;
+    }
+  }
+
+  /**
+   * Whether the output uses only the MKLDNN device.
+   * Otherwise, other devices should use only the CPU device.
+   */
+  bool outputIsOnlyMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
+    return outputOnlyMKLDNN_;
+  }
+
+  /**
+   * print info about sizes
+   */
+  virtual void printSizeInfo() {
+    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
+                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
+                       << ", oh: " << oh_ << ", ow: " << ow_;
+  }
+
+  /**
+   * print the mkldnn memory format of value
+   */
+  virtual void printValueFormat() {
+    for (size_t i = 0; i < inVals_.size(); ++i) {
+      if (!inVals_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
+                                                  : inVals_[i]->getFormat())
+                        << " >>> " << inVals_[i]->getFormat() << " >>>";
+    }
+    if (outVal_) {
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
+                        << (extOutVal_ ? extOutVal_->getFormat()
+                                       : outVal_->getFormat());
+    }
+    if (wgtVal_) {
+      VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
+    }
+    if (biasVal_) {
+      VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
+    }
+  }
+
+  /**
+   * print the mkldnn memory format of grad
+   */
+  virtual void printGradFormat() {
+    if (outGrad_) {
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
+                        << (extOutGrad_ ? extOutGrad_->getFormat()
+                                        : outGrad_->getFormat());
+    }
+    for (size_t i = 0; i < inGrads_.size(); ++i) {
+      if (!inGrads_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
+                                                   : inGrads_[i]->getFormat())
+                        << " <<< " << inGrads_[i]->getFormat() << " <<<";
+    }
+    if (wgtGrad_) {
+      VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
+    }
+    if (biasGrad_) {
+      VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
+    }
+  }
+
+ private:
+  /**
+   * clear all grad
+   */
+  void clearGrads() {
+    if (output_.grad) {
+      output_.grad->zeroMem();
+    }
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      if (outputOtherDevice_[i].grad) {
+        outputOtherDevice_[i].grad->zeroMem();
+      }
+    }
+  }
+
+  /**
+   * Set deviceId of the params used in this layer.
+   */
+  void setParamsDevice(int id, const ParameterMap& parameterMap) {
+    for (auto& inputConfig : config_.inputs()) {
+      if (inputConfig.has_input_parameter_name()) {
+        ParameterPtr parameter;
+        std::string name = inputConfig.input_parameter_name();
+        CHECK(mapGet(name, parameterMap, &parameter))
+            << "Cannot find input parameter " << name << " for layer "
+            << getName();
+        parameter->setDevice(id);
+      }
+    }
+    if (config_.has_bias_parameter_name()) {
+      ParameterPtr parameter;
+      std::string name = config_.bias_parameter_name();
+      CHECK(mapGet(name, parameterMap, &parameter))
+          << "Cannot find bias parameter " << name << " for layer "
+          << getName();
+      parameter->setDevice(id);
+    }
+  }
+
+  /**
+   * Set output map of prev layers.
+   */
+  void setOutputMap() {
+    outputMap_.clear();
+    for (size_t i = 0; i < inputLayers_.size(); ++i) {
+      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
+    }
+  }
+
+  /**
+   * if there is a cpu device, share value and grad data with output_
+   */
+  void shareCPUDevice() {
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].value = output_.value;
+      outputOtherDevice_[i].grad = output_.grad;
+    }
+  }
+
+  /**
+   * Check the cpu device number of outputOtherDevice_.
+   * There should be only one at most.
+   */
+  void checkCPUOutputsNumber(int max = 1) {
+    int cnt = 0;
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+        ++cnt;
+      }
+    }
+    CHECK_LE(cnt, max) << "too many CPU devices";
+  }
+
+  /**
+   * copy SeqInfo from the input layer to this output and other output devices.
+   * @note: do not use getInput(0) since it uses this deviceId_,
+   *        use "inputLayers_[0]->getOutput()" instead.
+   */
+  void copySeqInfoToOutputs() {
+    if (inputLayers_.empty() || !needSequenceInfo_) {
+      return;
+    }
+    const Argument& input = inputLayers_[0]->getOutput();
+    output_.sequenceStartPositions = input.sequenceStartPositions;
+    output_.subSequenceStartPositions = input.subSequenceStartPositions;
+    output_.cpuSequenceDims = input.cpuSequenceDims;
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].sequenceStartPositions =
+          output_.sequenceStartPositions;
+      outputOtherDevice_[i].subSequenceStartPositions =
+          output_.subSequenceStartPositions;
+      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+    }
+  }
+
+  void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // MKLDNNLayer output value should be MKLDNNMatrix,
+    // so an external output value is necessary.
+    // An external input value is not necessary,
+    // since the input may be an mkldnn internal buffer.
+    CHECK(extOutVal_) << "external output value is necessary";
+    output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+    CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
+    for (size_t i = 0; i < cvtInVals_.size(); ++i) {
+      if (cvtInVals_[i]) {
+        pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
+      }
+    }
+    if (cvtOutVal_) {
+      pipeline.push_back(*cvtOutVal_);
+    }
+  }
+  void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // an external output grad is not necessary,
+    // since the output may be an mkldnn internal buffer or merged directly.
+    CHECK(outGrad_) << "internal output grad is necessary";
+    if (extOutGrad_) {
+      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
+          << "the external buffer should share the same data with output_.grad";
+    }
+    if (cvtOutGrad_) {
+      pipeline.insert(pipeline.begin(), *cvtOutGrad_);
+    }
+    for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
+      if (cvtInGrads_[i]) {
+        pipeline.push_back(*cvtInGrads_[i]);
+      }
+    }
+  }
+};
+
+}  // namespace paddle
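The ordering that prepareValueConversions() and prepareGradConversions() impose on the pipeline can be summarized as: input reorders run before the forward primitive and the output reorder after it, while in backward the output-grad reorder runs first and the input-grad reorders last. A toy standalone sketch, with strings standing in for mkldnn primitives (illustration only):

```cpp
// Toy illustration of the pipeline ordering assembled above.
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Forward: input reorders first, compute in the middle, output reorder last.
  std::vector<std::string> fwd = {"fwd_primitive"};
  fwd.insert(fwd.begin(), "cvtInVal[0]");
  fwd.push_back("cvtOutVal");

  // Backward: output-grad reorder first, input-grad reorders last.
  std::vector<std::string> bwd = {"bwdWgt_primitive", "bwdData_primitive"};
  bwd.insert(bwd.begin(), "cvtOutGrad");
  bwd.push_back("cvtInGrad[0]");

  for (const auto& p : fwd) std::cout << p << " ";
  std::cout << "\n";
  for (const auto& p : bwd) std::cout << p << " ";
  std::cout << "\n";
  return 0;
}
```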
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNPoolLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/utils/Logging.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer); + +bool MKLDNNPoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + + /* the size of inputs for pool-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + const PoolConfig& conf = config_.inputs(0).pool_conf(); + ic_ = conf.channels(); + ih_ = conf.img_size_y(); + iw_ = conf.img_size(); + oc_ = ic_; + oh_ = conf.output_y(); + ow_ = conf.output_x(); + fh_ = conf.size_y(); + fw_ = conf.size_x(); + ph_ = conf.padding_y(); + pw_ = conf.padding(); + sh_ = conf.stride_y(); + sw_ = conf.stride(); + + const std::string& type = conf.pool_type(); + if (type == "max-projection") { + poolAlgo_ = algorithm::pooling_max; + } else if (type == "avg-projection") { + // paddle only uses exclude_padding + poolAlgo_ = algorithm::pooling_avg_exclude_padding; + } else { + LOG(FATAL) << "unknown pooling type!"; + } + return true; +} + +void MKLDNNPoolLayer::reshape( + int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { + reshapeInput(bs, ih, iw); + // ic_ and oc cannot be changed + CHECK_EQ((size_t)ic, + inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw) + << "Input channel cannot be changed"; + + // calculate output sizes + // paddle uses caffeMode = false for pooling + oh = outputSize(ih, fh_, ph_, sh_, false); + ow = outputSize(iw, fw_, pw_, sw_, false); + reshapeOutput(oh, ow); + + resizeOutput(bs, oc * oh * ow); +} + +void MKLDNNPoolLayer::resetFwd(std::vector<mkldnn::primitive>& pipeline, + std::vector<MKLDNNMatrixPtr>& inputs, + MKLDNNMatrixPtr& out) { + resetFwdBuffers(inputs[0], out); + + resetFwdPD(fwdPD_, inputs[0], out); + + resetFwdPipeline(pipeline, fwdPD_, inputs[0], out); +} + +void MKLDNNPoolLayer::resetBwd(std::vector<mkldnn::primitive>& pipeline, + std::vector<MKLDNNMatrixPtr>& inputs, + MKLDNNMatrixPtr& out) { + std::shared_ptr<pool_bwd::primitive_desc> pd; + + resetBwdBuffers(inputs[0], out); + + resetBwdPD(pd, inputs[0], out); + + resetBwdPipeline(pipeline, pd, inputs[0], out); +} + +void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + resetInValue(in); + + memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; + CHECK(in); + auto outPD = + MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_); + resetOutValue(out, outPD); +} + +void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd, + MKLDNNMatrixPtr in, + MKLDNNMatrixPtr out) { + memory::dims kernels = memory::dims{fh_, fw_}; + memory::dims strides = memory::dims{sh_, sw_}; + memory::dims padL = memory::dims{ph_, pw_}; + memory::dims padR = getPaddingR(); + padding_kind padKind = padding_kind::zero; + prop_kind pk = passType_ == PASS_TEST ?
prop_kind::forward_scoring + : prop_kind::forward_training; + auto fwdDesc = pool_fwd::desc(pk, + poolAlgo_, + in->getMemoryDesc(), + out->getMemoryDesc(), + strides, + kernels, + padL, + padR, + padKind); + pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_)); + + // prepare workspace if necessary + workspace_ = + (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max) + ? std::make_shared<memory>(memory(pd->workspace_primitive_desc())) + : nullptr; +} + +void MKLDNNPoolLayer::resetFwdPipeline( + std::vector<mkldnn::primitive>& pipeline, + std::shared_ptr<pool_fwd::primitive_desc>& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + fwd_ = workspace_ + ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_)) + : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out)); + pipeline.push_back(*fwd_); +} + +void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + CHECK(inVals_[0] && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVals_[0]->getPrimitiveDesc()); +} + +void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } + memory::dims kernels = memory::dims{fh_, fw_}; + memory::dims strides = memory::dims{sh_, sw_}; + memory::dims padL = memory::dims{ph_, pw_}; + memory::dims padR = getPaddingR(); + CHECK(out); + auto bwdDesc = pool_bwd::desc(poolAlgo_, + in->getMemoryDesc(), + out->getMemoryDesc(), + strides, + kernels, + padL, + padR, + padding_kind::zero); + pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_)); +} + +void MKLDNNPoolLayer::resetBwdPipeline( + std::vector<mkldnn::primitive>& pipeline, + std::shared_ptr<pool_bwd::primitive_desc>& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + if (pd == nullptr) { + return; + } + + bwdData_ = + workspace_ + ? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in)) + : std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in)); + pipeline.push_back(*bwdData_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h similarity index 100% rename from paddle/gserver/layers/MKLDNNPoolLayer.h rename to paddle/legacy/gserver/layers/MKLDNNPoolLayer.h diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp similarity index 100% rename from paddle/gserver/layers/MKLPackedRecurrentLayer.cpp rename to paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h similarity index 100% rename from paddle/gserver/layers/MKLPackedRecurrentLayer.h rename to paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h diff --git a/paddle/legacy/gserver/layers/MKLPackedWeight.h b/paddle/legacy/gserver/layers/MKLPackedWeight.h new file mode 100644 index 0000000000000000000000000000000000000000..47f225bd03c3ccb594db952483d3b8397b61e1ec --- /dev/null +++ b/paddle/legacy/gserver/layers/MKLPackedWeight.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/math/MathFunctions.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/parameter/Weight.h" + +namespace paddle { + +class MKLPackedWeight { + protected: + /// Pointer to the weight data + real *weight_; + /// Pointer to the cblas-packed copy of the weight + real *packedWeight_; + size_t height_; + size_t width_; + bool transW_; + + public: + explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) { + packedWeight_ = nullptr; + weight_ = weight->getData(); + height_ = weight->getHeight(); + width_ = weight->getWidth(); + transW_ = transW; + } + + ~MKLPackedWeight() { free_(); } + + void pack() { pack_(weight_); } + + void gemm_compute(const MatrixPtr src, MatrixPtr dst) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + src->getHeight(), + transW_ ? height_ : width_, + transW_ ? width_ : height_, + src->getData(), + src->getWidth(), + packedWeight_, + width_, + 1.0, + dst->getData(), + dst->getWidth()); + } + + protected: + void pack_(real *src) { + if (!packedWeight_) { + packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_); + } + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + transW_ ? CblasTrans : CblasNoTrans, + 1, + transW_ ? height_ : width_, + transW_ ? width_ : height_, + 1.0, + src, + width_, + packedWeight_); + } + + void free_() { + if (packedWeight_) { + cblas_sgemm_free(packedWeight_); + } + } +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/legacy/gserver/layers/MaxIdLayer.cpp similarity index 100% rename from paddle/gserver/layers/MaxIdLayer.cpp rename to paddle/legacy/gserver/layers/MaxIdLayer.cpp diff --git a/paddle/gserver/layers/MaxLayer.cpp b/paddle/legacy/gserver/layers/MaxLayer.cpp similarity index 100% rename from paddle/gserver/layers/MaxLayer.cpp rename to paddle/legacy/gserver/layers/MaxLayer.cpp diff --git a/paddle/legacy/gserver/layers/MaxLayer.h b/paddle/legacy/gserver/layers/MaxLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..6b3491cde56aa6897789cb1faad1099859bff12e --- /dev/null +++ b/paddle/legacy/gserver/layers/MaxLayer.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "SequencePoolLayer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/ThreadLocal.h" + +namespace paddle { + +/** + * A layer for "internal max" for sequence input. + * Input: one or more sequences. Each sequence contains some instances. + * If SequenceLevel = kNonSeq: + * Output: output size is the number of input sequences (NOT input instances) + * output[i] = max_{for each instance in this sequence}{input[i]} + * If stride_ > 0: + * Output: a shortened sequence.
Stride is the step size by which we slide a + * window upon the input sequence, and the max pooling operation is + * then applied to each interval independently. + * If SequenceLevel = kSeq: + * the input sequence must contain sub-sequences + * Output: output size is the number of input sub-sequences + * output[i] = max_{for each instance in this sub-sequence}{input[i]} + * + * The config file api is pooling_layer. + */ + +class MaxLayer : public SequencePoolLayer { + protected: + // maxIndex_[i][j] = k : the value at (i, j) is from input[k]. + IVectorPtr maxIndex_; + + public: + explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override { + return SequencePoolLayer::init(layerMap, parameterMap); + } + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/legacy/gserver/layers/MaxOutLayer.cpp similarity index 100% rename from paddle/gserver/layers/MaxOutLayer.cpp rename to paddle/legacy/gserver/layers/MaxOutLayer.cpp diff --git a/paddle/legacy/gserver/layers/MaxOutLayer.h b/paddle/legacy/gserver/layers/MaxOutLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..e56f34b8e02bf1dd48c6b5b6ea135cc1009c25b5 --- /dev/null +++ b/paddle/legacy/gserver/layers/MaxOutLayer.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * A layer that performs maxout on the output of a conv layer. + * Input: output of a conv layer. + * Output: feature maps of the same size as the input; the number of channels + * is (input channels) / groups, so the number of input channels must be + * divisible by groups. + * + * The config file api is maxout_layer.
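+ *
+ * As an illustrative sketch (not this layer's actual code), with
+ * outputChannels = channels / groups and each group of input maps laid
+ * out contiguously, the forward pass computes:
+ *
+ *   for (size_t oc = 0; oc < outputChannels; ++oc)
+ *     for (size_t i = 0; i < featLen; ++i) {
+ *       out[oc][i] = in[oc * groups][i];
+ *       for (size_t g = 1; g < groups; ++g)
+ *         out[oc][i] = std::max(out[oc][i], in[oc * groups + g][i]);
+ *     }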
+ */ + +class MaxOutLayer : public Layer { + protected: + size_t groups_; + size_t imgSizeH_, imgSizeW_; + /// outputChannels_ = channels_ / groups_ + size_t channels_, outputChannels_; + /// feature length = imgSizeH_ * imgSizeW_ + size_t featLen_; + IVectorPtr maxoutId_; + + public: + /// return imgSizeH_ * imgSizeW_ * outputChannels_; + size_t getSize(); + + explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {} + virtual ~MaxOutLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp similarity index 100% rename from paddle/gserver/layers/MaxPoolWithMaskLayer.cpp rename to paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp diff --git a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..fcd5388abe3f8229dfa418e6917a8a73c93900a7 --- /dev/null +++ b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "PoolLayer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { +/** + * @brief Max pooling layer that also outputs a mask of the max-value + * positions + */ +class MaxPoolWithMaskLayer : public PoolLayer { + protected: + Argument mask_; + + public: + explicit MaxPoolWithMaskLayer(const LayerConfig& config) + : PoolLayer(config) {} + + size_t getSize(); + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; +}; +} // namespace paddle diff --git a/paddle/gserver/layers/MixedLayer.cpp b/paddle/legacy/gserver/layers/MixedLayer.cpp similarity index 100% rename from paddle/gserver/layers/MixedLayer.cpp rename to paddle/legacy/gserver/layers/MixedLayer.cpp diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/legacy/gserver/layers/MixedLayer.h similarity index 100% rename from paddle/gserver/layers/MixedLayer.h rename to paddle/legacy/gserver/layers/MixedLayer.h diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp similarity index 100% rename from paddle/gserver/layers/MultiBoxLossLayer.cpp rename to paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/legacy/gserver/layers/MultiBoxLossLayer.h similarity index 100% rename from paddle/gserver/layers/MultiBoxLossLayer.h rename to paddle/legacy/gserver/layers/MultiBoxLossLayer.h diff --git a/paddle/gserver/layers/MultinomialSampler.cpp b/paddle/legacy/gserver/layers/MultinomialSampler.cpp similarity index 100% rename from paddle/gserver/layers/MultinomialSampler.cpp rename to paddle/legacy/gserver/layers/MultinomialSampler.cpp diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/legacy/gserver/layers/MultinomialSampler.h similarity index 100% rename from paddle/gserver/layers/MultinomialSampler.h rename to paddle/legacy/gserver/layers/MultinomialSampler.h diff --git a/paddle/legacy/gserver/layers/MultiplexLayer.cpp b/paddle/legacy/gserver/layers/MultiplexLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..54a554a1a9f3eb44421fa578604a5ea490ce0fcb --- /dev/null +++ b/paddle/legacy/gserver/layers/MultiplexLayer.cpp @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * @brief This layer multiplexes multiple layers according to the index, + * which is provided by the first input layer. + * - Input[0]: the index of the layer to output, of size batchSize. + * - Input[1:N]: the candidate output data. + * For each index i from 0 to batchSize - 1, the output is the i-th row of the + * (index[i] + 1)-th layer. + * + * For each i-th row of output: + * + * \f[ + * y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ...
, (x_{1}.width - 1) + * \f] + * where y is the output. \f$x_{k}\f$ is the k-th input layer and + * \f$k = x_{0}[i] + 1\f$. + */ + +class MultiplexLayer : public Layer { + protected: + /** + * @brief A struct used to save the copy information, including the input + * layer index and copy size. + */ + struct CopyInfo { + CopyInfo(int inStartIdx, int inLength, int inCopyIdx) + : startIdx(inStartIdx), length(inLength), copyIdx(inCopyIdx) {} + + /// The start row of input. + int startIdx; + /// Number of rows. If the layer index in Input[0] is not consecutive, + /// the length is one. Otherwise, the length is > 1 and multiple rows + /// are copied at once. + int length; + /// The index of the copied layer; add 1 to get the input layer index. + int copyIdx; + }; + + /// A list of CopyInfo used to save copy information. + std::vector<CopyInfo> copySchedule_; + + /// Temporary matrix pointer to point to input data. + MatrixPtr tmpSrc_; + /// Temporary matrix pointer to point to output data. + MatrixPtr tmpDest_; + + public: + explicit MultiplexLayer(const LayerConfig& config) : Layer(config) {} + + ~MultiplexLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + private: + /** + * @brief Calculate copy info for input layers. + */ + void calculateCopySchedule(const IVectorPtr& copyIds, size_t numIns); +}; + +REGISTER_LAYER(multiplex, MultiplexLayer); + +void MultiplexLayer::calculateCopySchedule(const IVectorPtr& copyIds, + size_t numIns) { + copySchedule_.clear(); + CopyInfo prevCopyInfo(0, 0, -1); + for (size_t i = 0; i < copyIds->getSize(); i++) { + int copyId = copyIds->getElement(i); + CHECK_GE(copyId, 0); + CHECK_LT(copyId, int(numIns)); + // Rows with the same input layer as the previous row are copied together.
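+ // Worked example (for illustration only): copyIds = {1, 1, 0, 2} with
+ // numIns = 3 yields the schedule {startIdx: 0, length: 2, copyIdx: 1},
+ // {2, 1, 0}, {3, 1, 2}: rows 0-1 are copied from Input[2], row 2 from
+ // Input[1], and row 3 from Input[3], since copyIdx + 1 selects the
+ // candidate input layer.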
+ if (copyId == prevCopyInfo.copyIdx) { + ++prevCopyInfo.length; + } else { + if (prevCopyInfo.copyIdx != -1) { + copySchedule_.emplace_back(prevCopyInfo); + } + prevCopyInfo.startIdx = i; + prevCopyInfo.length = 1; + prevCopyInfo.copyIdx = copyId; + } + } + if (prevCopyInfo.copyIdx != -1) { + copySchedule_.emplace_back(prevCopyInfo); + } +} + +bool MultiplexLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_GE(inputLayers_.size(), 2U); + + tmpSrc_ = + Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); + tmpDest_ = + Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); + return true; +} + +void MultiplexLayer::forward(PassType passType) { + Layer::forward(passType); + + IVectorPtr copyIds = getInput(0).ids; + MatrixPtr inV1 = getInputValue(1); + CHECK_EQ(copyIds->getSize(), inV1->getHeight()); + for (size_t i = 2; i < inputLayers_.size(); i++) { + CHECK_EQ(inV1->getHeight(), getInputValue(i)->getHeight()); + CHECK_EQ(inV1->getWidth(), getInputValue(i)->getWidth()); + } + + calculateCopySchedule(copyIds, inputLayers_.size() - 1); + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(inV1->getHeight(), inV1->getWidth()); + } + + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwLMultplexingTimer", getName().c_str()); + AsyncGpuBlock block; + for (const CopyInfo& info : copySchedule_) { + outV->subMatrix(info.startIdx, info.length, tmpDest_) + ->copyFrom(*getInputValue(info.copyIdx + 1) + ->subMatrix(info.startIdx, info.length, tmpSrc_)); + } + } + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void MultiplexLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + + MatrixPtr outG = getOutputGrad(); + + { + REGISTER_TIMER_INFO("BwLMultiplexTimer", getName().c_str()); + AsyncGpuBlock block; + for (const CopyInfo& info : copySchedule_) { + if (getInputGrad(info.copyIdx + 1)) { + getInputGrad(info.copyIdx + 1) + ->subMatrix(info.startIdx, info.length, tmpDest_) + ->add(*outG->subMatrix(info.startIdx, info.length, tmpSrc_)); + } + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/NCELayer.cpp b/paddle/legacy/gserver/layers/NCELayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ae4d6408168d1597760fe0094bc04f9cef657da4 --- /dev/null +++ b/paddle/legacy/gserver/layers/NCELayer.cpp @@ -0,0 +1,323 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "Layer.h" +#include "MultinomialSampler.h" +#include "paddle/legacy/math/MathFunctions.h" + +namespace paddle { + +/** + * Noise-contrastive estimation. + * Implements the method in the following paper: + * A fast and simple algorithm for training neural probabilistic language + * models. 
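+ * As a sketch of the cost computed in forwardCost() below: for a sample
+ * with model output o and noise weight b = num_neg_samples * q(label),
+ * where q is uniform (1 / numClasses_) when no sampler is configured,
+ *
+ *   cost = target ? -log(o / (o + b)) : -log(b / (o + b));
+ *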
+ * + * The config file api is nce_layer. + */ +class NCELayer : public Layer { + int numClasses_; + /// number of input layers besides labelLayer and weightLayer + int numInputs_; + LayerPtr labelLayer_; + /// weight layer, can be None + LayerPtr weightLayer_; + WeightList weights_; + std::unique_ptr<Weight> biases_; + std::unique_ptr<MultinomialSampler> sampler_; + + std::uniform_int_distribution<int> rand_; + + struct Sample { + int sampleId; + int labelId; + bool target; + real weight; + }; + std::vector<Sample> samples_; + /// whether samples_ is prepared + bool prepared_; + Argument sampleOut_; + + IVectorPtr labelIds_; + + public: + explicit NCELayer(const LayerConfig& config) + : Layer(config), + numClasses_(config.num_classes()), + rand_(0, config.num_classes() - 1), + prepared_(false) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* initialize the weightList */ + size_t i; + for (i = 0; i < inputLayers_.size(); i++) { + if (!parameters_[i]) break; + size_t width = inputLayers_[i]->getSize(); + // create a new weight + CHECK_EQ(parameters_[i]->getSize(), width * numClasses_); + Weight* w = new Weight(numClasses_, width, parameters_[i]); + + // append the new weight to the list + weights_.emplace_back(w); + } + + CHECK_EQ(1U, getSize()); + + numInputs_ = i; + CHECK_GE(numInputs_, 1) + << "Must have at least one input besides label and weight"; + CHECK_LT(i, inputLayers_.size()) << "Missing label layer"; + labelLayer_ = inputLayers_[i]; + if (++i < inputLayers_.size()) { + weightLayer_ = inputLayers_[i]; + ++i; + } + CHECK_EQ(i, inputLayers_.size()); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + CHECK_EQ(biasParameter_->getSize(), (size_t)numClasses_); + biases_.reset(new Weight(1, numClasses_, biasParameter_)); + } + + if (config_.neg_sampling_dist_size()) { + CHECK_EQ(numClasses_, config_.neg_sampling_dist_size()); + sampler_.reset(MultinomialSampler::create( + config_.neg_sampling_dist().data(), numClasses_)); + } + + return true; + } + + void prepareSamples() { + CHECK(!useGpu_) << "GPU is not supported"; + + int batchSize = getInput(*labelLayer_).getBatchSize(); + IVectorPtr label = getInput(*labelLayer_).ids; + + CpuSparseMatrixPtr multiLabel = std::dynamic_pointer_cast<CpuSparseMatrix>( + getInput(*labelLayer_).value); + + CHECK(label || multiLabel) + << "The label layer must have ids or NonValueSparseMatrix value"; + + auto& randEngine = ThreadLocalRandomEngine::get(); + + samples_.clear(); + samples_.reserve(batchSize * (1 + config_.num_neg_samples())); + + real* weight = + weightLayer_ ? getInputValue(*weightLayer_)->getData() : nullptr; + + for (int i = 0; i < batchSize; ++i) { + real w = weight ? weight[i] : 1; + if (label) { + int* ids = label->getData(); + samples_.push_back({i, ids[i], true, w}); + } else { + const int* cols = multiLabel->getRowCols(i); + int n = multiLabel->getColNum(i); + for (int j = 0; j < n; ++j) { + samples_.push_back({i, cols[j], true, w}); + } + } + for (int j = 0; j < config_.num_neg_samples(); ++j) { + int id = sampler_ ?
sampler_->gen(randEngine) : rand_(randEngine); + samples_.push_back({i, id, false, w}); + } + } + prepared_ = true; + } + + void prefetch() override { + prepareSamples(); + IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_); + int* ids = labelIds_->getData(); + for (size_t i = 0; i < samples_.size(); ++i) { + ids[i] = samples_[i].labelId; + } + + for (int i = 0; i < numInputs_; ++i) { + auto sparseParam = + dynamic_cast(weights_[i]->getW().get()); + if (sparseParam) { + sparseParam->addRows(labelIds_); + } + } + } + + void forward(PassType passType) override { + Layer::forward(passType); + + CHECK(!useGpu_) << "GPU is not supported"; + + if (!prepared_) { + if (passType == PASS_GC) { + ThreadLocalRandomEngine::get().seed(ThreadLocalRand::getDefaultSeed()); + } + prepareSamples(); + } + prepared_ = false; + + /* malloc memory for the output_ if necessary */ + int batchSize = getInputValue(0)->getHeight(); + int size = getSize(); + resetOutput(batchSize, size); + + Matrix::resizeOrCreate(sampleOut_.value, + 1, + samples_.size(), + /* trans= */ false, + useGpu_); + + forwardBias(); + + for (int l = 0; l < numInputs_; ++l) { + forwardOneInput(l); + } + + auto status = activation_->forward(sampleOut_); + status.check(); + + forwardCost(); + } + + void backward(const UpdateCallback& callback) override { + Matrix::resizeOrCreate(sampleOut_.grad, + 1, + samples_.size(), + /* trans= */ false, + useGpu_); + + backwardCost(); + + auto status = activation_->backward(sampleOut_); + status.check(); + + if (biases_->getWGrad()) { + backwardBias(callback); + } + + for (int l = 0; l < numInputs_; ++l) { + backwardOneInput(l, callback); + } + } + + void forwardBias() { + if (!biases_) { + sampleOut_.value->zeroMem(); + } else { + real* bias = biases_->getW()->getData(); + real* sampleOut = sampleOut_.value->getData(); + for (size_t i = 0; i < samples_.size(); ++i) { + sampleOut[i] = bias[samples_[i].labelId]; + } + } + } + + void backwardBias(const UpdateCallback& callback) { + if (!biases_) return; + real* bias = biases_->getWGrad()->getData(); + real* sampleOut = sampleOut_.grad->getData(); + for (size_t i = 0; i < samples_.size(); ++i) { + bias[samples_[i].labelId] += sampleOut[i]; + } + biases_->incUpdate(callback); + } + + void forwardOneInput(int layerId) { + const MatrixPtr& inputMat = getInputValue(layerId); + const MatrixPtr& weightMat = weights_[layerId]->getW(); + + int dim = inputMat->getWidth(); + real* sampleOut = sampleOut_.value->getData(); + + for (size_t i = 0; i < samples_.size(); ++i) { + sampleOut[i] += dotProduct(dim, + inputMat->getRowBuf(samples_[i].sampleId), + weightMat->getRowBuf(samples_[i].labelId)); + } + } + + void backwardOneInput(int layerId, const UpdateCallback& callback) { + const MatrixPtr& inputMat = getInputValue(layerId); + const MatrixPtr& inputGradMat = getInputGrad(layerId); + const MatrixPtr& weightMat = weights_[layerId]->getW(); + const MatrixPtr& weightGradMat = weights_[layerId]->getWGrad(); + + int dim = inputMat->getWidth(); + real* sampleGrad = sampleOut_.grad->getData(); + + if (weightGradMat) { + for (size_t i = 0; i < samples_.size(); ++i) { + axpy(dim, + sampleGrad[i], + inputMat->getRowBuf(samples_[i].sampleId), + weightGradMat->getRowBuf(samples_[i].labelId)); + } + weights_[layerId]->incUpdate(callback); + } + + if (inputGradMat) { + for (size_t i = 0; i < samples_.size(); ++i) { + axpy(dim, + sampleGrad[i], + weightMat->getRowBuf(samples_[i].labelId), + inputGradMat->getRowBuf(samples_[i].sampleId)); + } + } + } + + void forwardCost() { + 
real* out = output_.value->getData(); + real* sampleOut = sampleOut_.value->getData(); + real b = 1. / numClasses_ * config_.num_neg_samples(); + for (size_t i = 0; i < samples_.size(); ++i) { + real o = sampleOut[i]; + if (sampler_) { + b = config_.num_neg_samples() * + config_.neg_sampling_dist(samples_[i].labelId); + } + real cost = samples_[i].target ? -log(o / (o + b)) : -log(b / (o + b)); + out[samples_[i].sampleId] += samples_[i].weight * cost; + } + } + + void backwardCost() { + real* sampleOut = sampleOut_.value->getData(); + real* sampleGrad = sampleOut_.grad->getData(); + + real b = 1. / numClasses_ * config_.num_neg_samples(); + for (size_t i = 0; i < samples_.size(); ++i) { + real o = sampleOut[i]; + if (sampler_) { + b = config_.num_neg_samples() * + config_.neg_sampling_dist(samples_[i].labelId); + } + real w = samples_[i].weight; + sampleGrad[i] = samples_[i].target ? -w * b / (o * (o + b)) : w / (o + b); + } + } +}; + +REGISTER_LAYER(nce, NCELayer); + +} // namespace paddle diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/legacy/gserver/layers/NormLayer.cpp similarity index 100% rename from paddle/gserver/layers/NormLayer.cpp rename to paddle/legacy/gserver/layers/NormLayer.cpp diff --git a/paddle/legacy/gserver/layers/NormLayer.h b/paddle/legacy/gserver/layers/NormLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..5ac00034d086a5952b30576268c72af326e3ebf9 --- /dev/null +++ b/paddle/legacy/gserver/layers/NormLayer.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "NormLayer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief Basic parent layer of normalization + * + * @note Normalizes the input in a local region + */ +class NormLayer : public Layer { + public: + explicit NormLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override { + Layer::init(layerMap, parameterMap); + return true; + } + + /** + * @brief create norm layer by norm_type + */ + static Layer* create(const LayerConfig& config); +}; + +/** + * @brief response normalization within feature maps, + * i.e., normalization within each channel independently. + * The original implementation was deleted during code refactoring + * and needs to be reimplemented in the future.
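+ * As a sketch of the intended computation (standard local response
+ * normalization within a channel; the exact windowing here is an
+ * assumption, but it matches the scale_, pow_, and denoms_ members below):
+ *
+ *   denoms[i] = 1 + scale * sum_{j in window(i)} input[j]^2;
+ *   output[i] = input[i] / pow(denoms[i], power);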
+ */ +class ResponseNormLayer : public NormLayer { + protected: + size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_; + real scale_, pow_; + MatrixPtr denoms_; + + public: + explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; } + void backward(const UpdateCallback& callback = nullptr) override { + LOG(FATAL) << "Not implemented"; + } +}; + +/** + * This layer applies normalization across the channels of each sample to a + * conv layer's output, and scales the output by a group of trainable factors + * whose dimension equals the number of channels. + * - Input: One and only one input layer is accepted. + * - Output: The normalized data of the input data. + * Reference: + * Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, + * Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector + */ +class CrossChannelNormLayer : public NormLayer { + public: + explicit CrossChannelNormLayer(const LayerConfig& config) + : NormLayer(config) {} + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + void forward(PassType passType); + void backward(const UpdateCallback& callback); + MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim); + MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim); + + protected: + size_t channels_; + std::unique_ptr<Weight> scale_; + MatrixPtr scaleDiff_; + MatrixPtr normBuffer_; + MatrixPtr dataBuffer_; + MatrixPtr channelBuffer_; + MatrixPtr spatialBuffer_; + MatrixPtr sampleBuffer_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/legacy/gserver/layers/NormProjectionLayer.cpp similarity index 100% rename from paddle/gserver/layers/NormProjectionLayer.cpp rename to paddle/legacy/gserver/layers/NormProjectionLayer.cpp diff --git a/paddle/legacy/gserver/layers/NormProjectionLayer.h b/paddle/legacy/gserver/layers/NormProjectionLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..492d1fcb72343a54577a459aaa5de53596f43f42 --- /dev/null +++ b/paddle/legacy/gserver/layers/NormProjectionLayer.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once + +#include +#include "NormLayer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief response normalization across feature maps, + * namely normalization over size_ neighboring channels + */ +class CMRProjectionNormLayer : public ResponseNormLayer { + size_t imgSizeH_, imgSizeW_; + size_t outputH_, outputW_; + + public: + explicit CMRProjectionNormLayer(const LayerConfig& config) + : ResponseNormLayer(config) {} + + ~CMRProjectionNormLayer() {} + + size_t getSize(); + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + protected: + TensorShape shape_; +}; +} // namespace paddle diff --git a/paddle/gserver/layers/Operator.cpp b/paddle/legacy/gserver/layers/Operator.cpp similarity index 100% rename from paddle/gserver/layers/Operator.cpp rename to paddle/legacy/gserver/layers/Operator.cpp diff --git a/paddle/legacy/gserver/layers/Operator.h b/paddle/legacy/gserver/layers/Operator.h new file mode 100644 index 0000000000000000000000000000000000000000..20a248985eb6b3aba016b28bca4c0eea44baa868 --- /dev/null +++ b/paddle/legacy/gserver/layers/Operator.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ModelConfig.pb.h" +#include "paddle/legacy/parameter/Parameter.h" + +#include "Layer.h" +#include "paddle/legacy/parameter/Argument.h" + +namespace paddle { + +// Macro for registering an operator type +// Example: REGISTER_OPERATOR(dot_mul, DotMulOperator); +#define REGISTER_OPERATOR(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([]() { \ + Operator::registrar_.registerClass<__class_name>(#__type_name); \ + }) + +/** + * An Operator is like a Projection, but takes more than one Argument as + * input. + * @note: Operator can't have parameters. + */ +class Operator { + public: + static Operator* create(const OperatorConfig& config, bool useGpu); + + Operator(const OperatorConfig& config, bool useGpu) + : config_(config), useGpu_(useGpu) {} + + virtual ~Operator() {} + + const OperatorConfig& getConfig() const { return config_; } + + static ClassRegistrar<Operator, OperatorConfig, bool> registrar_; + + /** + * Forward propagation. If backward() will be called, in and out must be kept + * valid until then. + * @param ins inputs of operator + * @param out output of operator + * @param passType PASS_TRAIN or PASS_TEST + */ + void forward(std::vector<const Argument*> ins, + Argument* out, + PassType passType) { + ins_ = ins; + out_ = out; + passType_ = passType; + forward(); + } + + virtual void prefetch(const Argument* in) {} + virtual void forward() = 0; + virtual void backward() = 0; + + /** + * See comment in Layer.h for the function with the same name. + */ + virtual void resetState() {} + + /** + * Set layer state. + */ + virtual void setState(LayerStatePtr state) {} + + /** + * Get layer state.
+ */ + virtual LayerStatePtr getState() { return nullptr; } + + protected: + /// Config of operator + OperatorConfig config_; + bool useGpu_; + + /// Store `ins` passed to forward() + std::vector<const Argument*> ins_; + /// Store `out` passed to forward() + Argument* out_; + /// Store `passType` passed to forward() + PassType passType_; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/OuterProdLayer.cpp b/paddle/legacy/gserver/layers/OuterProdLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7988560d5aa271d444f158e656bbc460152b2590 --- /dev/null +++ b/paddle/legacy/gserver/layers/OuterProdLayer.cpp @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for computing the outer product of two vectors + * @note used in NEURAL TURING MACHINE + * Input1: vector (batchSize * dim1) + * Input2: vector (batchSize * dim2) + * Output: a matrix: (batchSize * (dim1*dim2)) + */ + +class OuterProdLayer : public Layer { + protected: + MatrixPtr tmpMtx0; + MatrixPtr tmpRow0; + MatrixPtr tmpRow1; + + public: + explicit OuterProdLayer(const LayerConfig& config) : Layer(config) {} + + ~OuterProdLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(out_prod, OuterProdLayer); + +bool OuterProdLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + + size_t dim0 = inputLayers_[0]->getSize(); + size_t dim1 = inputLayers_[1]->getSize(); + + CHECK_EQ(dim0 * dim1, getSize()) << "Dimension mismatch"; + + tmpRow0 = Matrix::create( + nullptr, /* height= */ 1, dim0, /* trans= */ false, useGpu_); + tmpRow1 = Matrix::create( + nullptr, /* height= */ 1, dim1, /* trans= */ false, useGpu_); + tmpMtx0 = Matrix::create(nullptr, + /* height= */ dim0, + dim1, + /* trans= */ false, + useGpu_); + return true; +} + +void OuterProdLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV0->getHeight(); + size_t dim0 = inV0->getWidth(); + size_t dim1 = inV1->getWidth(); + + CHECK_EQ(dim0 * dim1, getSize()); + CHECK_EQ(inV1->getHeight(), batchSize); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, dim0 * dim1); + } + + MatrixPtr outV = getOutputValue(); + + { + REGISTER_TIMER_INFO("FwOutProdTimer", getName().c_str()); + for (size_t i = 0; i < batchSize; i++) { + tmpMtx0->setData(outV->getData() + i * dim0 * dim1); + tmpRow0->setData(inV0->getData() + i * dim0); + tmpRow1->setData(inV1->getData() + i * dim1); + + tmpMtx0->mul(*tmpRow0->getTranspose(),
*tmpRow1); + } + } +} + +void OuterProdLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr outG = getOutputGrad(); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + + size_t batchSize = inV0->getHeight(); + size_t dim0 = inV0->getWidth(); + size_t dim1 = inV1->getWidth(); + + { + REGISTER_TIMER_INFO("BwOutProdTimer", getName().c_str()); + + if (inG0) { + for (size_t i = 0; i < batchSize; i++) { + tmpMtx0->setData(outG->getData() + i * dim0 * dim1); + tmpRow0->setData(inG0->getData() + i * dim0); + tmpRow1->setData(inV1->getData() + i * dim1); + + tmpRow0->mul(*tmpRow1, *tmpMtx0->getTranspose(), 1, 1); + } + } + + if (inG1) { + for (size_t i = 0; i < batchSize; i++) { + tmpMtx0->setData(outG->getData() + i * dim0 * dim1); + tmpRow0->setData(inV0->getData() + i * dim0); + tmpRow1->setData(inG1->getData() + i * dim1); + + tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 1); + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/legacy/gserver/layers/PadLayer.cpp similarity index 100% rename from paddle/gserver/layers/PadLayer.cpp rename to paddle/legacy/gserver/layers/PadLayer.cpp diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/legacy/gserver/layers/PadLayer.h similarity index 100% rename from paddle/gserver/layers/PadLayer.h rename to paddle/legacy/gserver/layers/PadLayer.h diff --git a/paddle/gserver/layers/ParameterReluLayer.cpp b/paddle/legacy/gserver/layers/ParameterReluLayer.cpp similarity index 100% rename from paddle/gserver/layers/ParameterReluLayer.cpp rename to paddle/legacy/gserver/layers/ParameterReluLayer.cpp diff --git a/paddle/legacy/gserver/layers/ParameterReluLayer.h b/paddle/legacy/gserver/layers/ParameterReluLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..a4abd7af75547ca2108a31552904fe6e83dcd8f1 --- /dev/null +++ b/paddle/legacy/gserver/layers/ParameterReluLayer.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/ThreadLocal.h" + +namespace paddle { + +/** + * @brief ParameterReluLayer activates inputs with the learnable parameter + * weight_. + * forward: + * \f[ + * y = x > 0 ? x : w .* x + * \f] + * backward: + * \f[ + * dx = x > 0 ? dy : w .* dy \\ + * dw = x > 0 ? 0 : dy.*x + * \f] + * Here, x is the input, w is the weight, y is the output. + * dx, dw, and dy are the gradients.
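+ *
+ * As an element-wise sketch over a flattened sample (the grouping via
+ * partialSum_ is documented in the class below):
+ *
+ *   y[i] = x[i] > 0 ? x[i] : w[i / partialSum] * x[i];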
+ */ + +class ParameterReluLayer : public Layer { + protected: + std::unique_ptr<Weight> weight_; + + /** + * @brief partialSum_ makes a group of inputs share the same weight: + * - partialSum_ = 1: + * element-wise activation: each element has its own weight_, + * - partialSum_ = number of elements in one channel: + * channel-wise activation: elements in a channel share the same + * weight_, + * - partialSum_ = number of outputs: + * all elements share the same weight_. + */ + size_t partialSum_; + + public: + explicit ParameterReluLayer(const LayerConfig& config) : Layer(config) {} + + ~ParameterReluLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; +} // namespace paddle diff --git a/paddle/gserver/layers/Pool3DLayer.cpp b/paddle/legacy/gserver/layers/Pool3DLayer.cpp similarity index 100% rename from paddle/gserver/layers/Pool3DLayer.cpp rename to paddle/legacy/gserver/layers/Pool3DLayer.cpp diff --git a/paddle/legacy/gserver/layers/Pool3DLayer.h b/paddle/legacy/gserver/layers/Pool3DLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..6851c44ab22a39bebe3592b8e5f6384a393947f2 --- /dev/null +++ b/paddle/legacy/gserver/layers/Pool3DLayer.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief Basic parent layer of pooling + * Pools the input within regions + */ +class Pool3DLayer : public Layer { + public: + explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {} + ~Pool3DLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + size_t getSize(); + + protected: + int channels_; + int sizeX_, sizeY_, sizeZ_; + int strideW_, strideH_, strideD_; + int paddingW_, paddingH_, paddingD_; + int imgSizeW_, imgSizeH_, imgSizeD_; + int outputW_, outputH_, outputD_; + std::string poolType_; + MatrixPtr maxPoolIdx_; +}; +} // namespace paddle diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/legacy/gserver/layers/PoolLayer.cpp similarity index 100% rename from paddle/gserver/layers/PoolLayer.cpp rename to paddle/legacy/gserver/layers/PoolLayer.cpp diff --git a/paddle/legacy/gserver/layers/PoolLayer.h b/paddle/legacy/gserver/layers/PoolLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..0808dfae8497008f974730b65977c85e914a7a27 --- /dev/null +++ b/paddle/legacy/gserver/layers/PoolLayer.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief Basic parent layer of pooling + * Pools the input within regions + */ +class PoolLayer : public Layer { + protected: + size_t channels_, sizeX_, stride_, outputX_, imgSize_; + int confPadding_; + + size_t sizeY_; + size_t imgSizeY_; + size_t strideY_; + size_t outputY_; + int confPaddingY_; + + std::string poolType_; + + bool excludeMode_; + + public: + explicit PoolLayer(const LayerConfig& config) : Layer(config) {} + + /** + * @brief create pooling layer by pool_type + */ + static Layer* create(const LayerConfig& config); + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/legacy/gserver/layers/PoolProjection.cpp similarity index 100% rename from paddle/gserver/layers/PoolProjection.cpp rename to paddle/legacy/gserver/layers/PoolProjection.cpp diff --git a/paddle/legacy/gserver/layers/PoolProjection.h b/paddle/legacy/gserver/layers/PoolProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..d01b6a13f0a5fd2283f1f216ef419b9ccc7308f9 --- /dev/null +++ b/paddle/legacy/gserver/layers/PoolProjection.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "Projection.h" +#include "paddle/legacy/math/MathUtils.h" + +namespace paddle { + +class PoolProjection : public Projection { + protected: + size_t imgSizeY_, imgSize_; + size_t outputY_, outputX_; + size_t strideY_, stride_; + size_t sizeY_, sizeX_; + int confPaddingY_, confPadding_; + size_t channels_; + std::string poolType_; + bool excludeMode_; + + public: + PoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + static PoolProjection* create(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + const std::string& getPoolType() const { return poolType_; } + + size_t getSize(); +}; + +class MaxPoolProjection : public PoolProjection { + public: + MaxPoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : PoolProjection(config, parameter, useGpu) {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback = nullptr); +}; + +class AvgPoolProjection : public PoolProjection { + public: + AvgPoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : PoolProjection(config, parameter, useGpu) {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback = nullptr); +}; +} // namespace paddle diff --git a/paddle/gserver/layers/PoolProjectionLayer.cpp b/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp similarity index 100% rename from paddle/gserver/layers/PoolProjectionLayer.cpp rename to paddle/legacy/gserver/layers/PoolProjectionLayer.cpp diff --git a/paddle/legacy/gserver/layers/PoolProjectionLayer.h b/paddle/legacy/gserver/layers/PoolProjectionLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..fcd35bbba4dff612fba827cdf545de71127c560e --- /dev/null +++ b/paddle/legacy/gserver/layers/PoolProjectionLayer.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "PoolLayer.h" +#include "PoolProjection.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { +/** + * @brief Basic parent layer of different kinds of pooling + */ +class PoolProjectionLayer : public PoolLayer { + protected: + size_t imgSizeH_, imgSizeW_; + size_t outputH_, outputW_; + std::unique_ptr<PoolProjection> poolProjection_; + ProjectionConfig projectionConfig_; + + public: + explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) { + PoolConfig* conf = projectionConfig_.mutable_pool_conf(); + *conf = config_.inputs(0).pool_conf(); + poolProjection_.reset( + PoolProjection::create(projectionConfig_, nullptr, useGpu_)); + } + + size_t getSize(); + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/PowerLayer.cpp b/paddle/legacy/gserver/layers/PowerLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..26a57fcfdd6e8b746b6ba67bd8c7fb674c4cc796 --- /dev/null +++ b/paddle/legacy/gserver/layers/PowerLayer.cpp @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * This layer applies a power function to a vector element-wise, + * which is used in NEURAL TURING MACHINE. + * \f[ + * y = x^w + * \f] + * where \f$x\f$ is an input vector, \f$w\f$ is a scalar weight, + * and the output \f$y\f$ is a vector. + * + * The config file api is power_layer.
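+ *
+ * The gradients used in backward() below follow directly from y = x^w:
+ *
+ *   dy/dw = y * log(x)   (accumulated into the scalar weight input's grad)
+ *   dy/dx = w * y / x    (accumulated into the vector input's grad)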
+ */ + +class PowerLayer : public Layer { + protected: + MatrixPtr tmpMtx; + + public: + explicit PowerLayer(const LayerConfig& config) : Layer(config) {} + + ~PowerLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(power, PowerLayer); + +bool PowerLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + + return true; +} + +void PowerLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV1->getHeight(); + size_t dataDim = inV1->getWidth(); + + CHECK_EQ(getSize(), dataDim); + CHECK_EQ(1U, inV0->getWidth()); + CHECK_EQ(batchSize, inV0->getHeight()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, dataDim); + } + + MatrixPtr outV = getOutputValue(); + + { + REGISTER_TIMER_INFO("FwPowerTimer", getName().c_str()); + outV->rowPow(0, *inV1, *inV0); + } +} + +void PowerLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + + size_t batchSize = inV1->getHeight(); + size_t dataDim = inV1->getWidth(); + + { + REGISTER_TIMER_INFO("BwPowerTimer", getName().c_str()); + Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_); + + if (inG0) { + tmpMtx->log2(*inV1); + tmpMtx->dotMul(*tmpMtx, *outV); + + // inG0 += outG .* (log(inV1) * outV) + inG0->rowDotMul(0, *outG, *tmpMtx); + } + + if (inG1) { + // tmp = (outV / inV1) * inV0 + tmpMtx->dotDiv(*outV, *inV1); + tmpMtx->rowScale(0, *tmpMtx, *inV0); + + inG1->addDotMul(*outG, *tmpMtx, 1, 1); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/legacy/gserver/layers/PrintLayer.cpp similarity index 100% rename from paddle/gserver/layers/PrintLayer.cpp rename to paddle/legacy/gserver/layers/PrintLayer.cpp diff --git a/paddle/legacy/gserver/layers/PriorBox.cpp b/paddle/legacy/gserver/layers/PriorBox.cpp new file mode 100644 index 0000000000000000000000000000000000000000..83aab6e36662855a5867463757bc5a92e6e83e07 --- /dev/null +++ b/paddle/legacy/gserver/layers/PriorBox.cpp @@ -0,0 +1,159 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/BaseMatrix.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { +/** + * @brief A layer for generating priorbox locations and variances. + * - Input: Two and only two input layers are accepted. The input layers must + * be a data output layer and a convolution output layer.
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/legacy/gserver/layers/PrintLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/PrintLayer.cpp
rename to paddle/legacy/gserver/layers/PrintLayer.cpp
diff --git a/paddle/legacy/gserver/layers/PriorBox.cpp b/paddle/legacy/gserver/layers/PriorBox.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..83aab6e36662855a5867463757bc5a92e6e83e07
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PriorBox.cpp
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief A layer for generating priorbox locations and variances.
+ * - Input: Two and only two input layers are accepted. They must be a data
+ *   output layer and a convolution output layer.
+ * - Output: The priorbox locations and variances of the input data.
+ * Reference:
+ *   Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *   Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+
+class PriorBoxLayer : public Layer {
+ public:  // NOLINT
+  explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override {}
+
+ protected:  // NOLINT
+  int numPriors_;
+  std::vector<real> minSize_;
+  std::vector<real> maxSize_;
+  std::vector<real> aspectRatio_;
+  std::vector<real> variance_;
+  MatrixPtr buffer_;
+};
+
+REGISTER_LAYER(priorbox, PriorBoxLayer);
+
+bool PriorBoxLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  auto pbConf = config_.inputs(0).priorbox_conf();
+  std::vector<real> tmp;
+  aspectRatio_.push_back(1.);
+  std::copy(pbConf.min_size().begin(),
+            pbConf.min_size().end(),
+            std::back_inserter(minSize_));
+  std::copy(pbConf.max_size().begin(),
+            pbConf.max_size().end(),
+            std::back_inserter(maxSize_));
+  std::copy(pbConf.variance().begin(),
+            pbConf.variance().end(),
+            std::back_inserter(variance_));
+  std::copy(pbConf.aspect_ratio().begin(),
+            pbConf.aspect_ratio().end(),
+            std::back_inserter(tmp));
+
+  if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
+
+  // flip aspect ratios
+  for (unsigned index = 0; index < tmp.size(); index++) {
+    real ar = tmp[index];
+    if (fabs(ar - 1.) < 1e-6) continue;
+    aspectRatio_.push_back(ar);
+    aspectRatio_.push_back(1. / ar);
+  }
+
+  numPriors_ = aspectRatio_.size() * minSize_.size() + maxSize_.size();
+
+  return true;
+}
+
+void PriorBoxLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  auto input = getInput(0);
+  int layerWidth = input.getFrameWidth();
+  int layerHeight = input.getFrameHeight();
+
+  auto image = getInput(1);
+  int imageWidth = image.getFrameWidth();
+  int imageHeight = image.getFrameHeight();
+
+  real stepW = static_cast<real>(imageWidth) / layerWidth;
+  real stepH = static_cast<real>(imageHeight) / layerHeight;
+  int dim = layerHeight * layerWidth * numPriors_ * 4;
+  reserveOutput(1, dim * 2);
+  // use a cpu buffer to compute
+  Matrix::resizeOrCreate(buffer_, 1, dim * 2, false, false);
+  auto* tmpPtr = buffer_->getData();
+
+  int idx = 0;
+  for (int h = 0; h < layerHeight; ++h) {
+    for (int w = 0; w < layerWidth; ++w) {
+      real centerX = (w + 0.5) * stepW;
+      real centerY = (h + 0.5) * stepH;
+      for (size_t s = 0; s < minSize_.size(); s++) {
+        real minSize = minSize_[s];
+        real boxWidth = minSize;
+        real boxHeight = minSize;
+
+        // first prior: aspect_ratio == 1.0, compatible to old logic
+        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+        // set the variance.
+        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+
+        if (maxSize_.size() > 0) {
+          // square prior with size sqrt(minSize * maxSize)
+          real maxSize = maxSize_[s];
+          boxWidth = boxHeight = sqrt(minSize * maxSize);
+          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+          // set the variance.
+          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+        }
+
+        // priors with different aspect ratios
+        for (size_t r = 0; r < aspectRatio_.size(); r++) {
+          real ar = aspectRatio_[r];
+          if (fabs(ar - 1.0) < 1e-6) {
+            continue;
+          }
+          boxWidth = minSize * sqrt(ar);
+          boxHeight = minSize / sqrt(ar);
+          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+          // set the variance.
+          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+        }
+      }
+    }
+  }
+
+  // clip the prior's coordinate such that it is within [0, 1]
+  for (int d = 0; d < dim * 2; ++d)
+    if ((d % 8) < 4)
+      tmpPtr[d] = std::min(std::max(tmpPtr[d], (real)0.), (real)1.);
+  MatrixPtr outV = getOutputValue();
+  outV->copyFrom(buffer_->data_, dim * 2);
+}
+
+}  // namespace paddle
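To make the output layout above concrete, a small self-contained sketch with assumed config values (min_size 30, max_size 60, one extra aspect ratio 2, a 19x19 feature map); none of these numbers come from the patch itself:

#include <cstdio>

int main() {
  // init() keeps ratio 1, then appends 2 and 1/2, so aspectRatio_ = {1, 2, 0.5}.
  int numAspectRatios = 3, numMinSizes = 1, numMaxSizes = 1;
  int numPriors = numAspectRatios * numMinSizes + numMaxSizes;  // 4
  int layerW = 19, layerH = 19;                 // assumed feature-map size
  int dim = layerH * layerW * numPriors * 4;    // 4 corner coordinates per prior
  // Each prior occupies 8 consecutive slots in the dim * 2 output row:
  // (xmin, ymin, xmax, ymax) then 4 variances, which is why the clipping
  // loop above only touches indices with (d % 8) < 4.
  printf("numPriors = %d, output size = %d\n", numPriors, dim * 2);
  return 0;
}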
diff --git a/paddle/gserver/layers/Projection.cpp b/paddle/legacy/gserver/layers/Projection.cpp
similarity index 100%
rename from paddle/gserver/layers/Projection.cpp
rename to paddle/legacy/gserver/layers/Projection.cpp
diff --git a/paddle/legacy/gserver/layers/Projection.h b/paddle/legacy/gserver/layers/Projection.h
new file mode 100644
index 0000000000000000000000000000000000000000..974f5a2cacd10a965adcb4accf6ca00c26044b64
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Projection.h
@@ -0,0 +1,140 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/parameter/Parameter.h"
+
+namespace paddle {
+
+// Macro for registering a projection type
+// Example: REGISTER_PROJECTION(fc, FullMatrixProjection);
+#define REGISTER_PROJECTION(__type_name, __class_name) \
+  static InitFunction __reg_type_##__type_name([]() { \
+    Projection::registrar_.registerClass<__class_name>(#__type_name); \
+  })
+
+#define REGISTER_PROJECTION_CREATE_FUNC(__type_name, createFunction) \
+  static InitFunction __reg_type_##__type_name([]() { \
+    Projection::registrar_.registerClass(#__type_name, createFunction); \
+  })
+
+/**
+ * A projection takes one Argument as input, calculates the result and adds it
+ * to the output Argument.
+ */
+class Projection {
+ public:
+  static Projection* create(const ProjectionConfig& config,
+                            ParameterPtr parameter,
+                            bool useGpu);
+
+  Projection(const ProjectionConfig& config,
+             ParameterPtr parameter,
+             bool useGpu)
+      : config_(config), parameter_(parameter), useGpu_(useGpu) {}
+
+  virtual ~Projection() {}
+
+  const std::string& getName() const { return config_.name(); }
+
+  /// Register a projection
+  static ClassRegistrar<Projection, ProjectionConfig, ParameterPtr, bool>
+      registrar_;
+
+  /**
+   * Forward propagation. If backward() will be called, in and out must be
+   * kept valid until then.
+   * @param in input of projection
+   * @param out output of projection
+   * @param passType PASS_TRAIN or PASS_TEST
+   */
+  void forward(const Argument* in, const Argument* out, PassType passType) {
+    in_ = in;
+    out_ = out;
+    passType_ = passType;
+    forward();
+  }
+
+  virtual void prefetch(const Argument* in) {}
+  virtual void forward() = 0;
+  virtual void backward(const UpdateCallback& callback) = 0;
+
+  /**
+   * See comment in Layer.h for the function with the same name.
+   */
+  virtual void resetState() {}
+
+  /**
+   * Set layer state.
+   */
+  virtual void setState(LayerStatePtr state) {}
+
+  /**
+   * Get layer state. A copy of internal state is returned.
+   */
+  virtual LayerStatePtr getState() { return nullptr; }
+
+  /**
+   * init forward_ and backward_ functions
+   */
+  virtual bool init() { return true; }
+
+  /**
+   * Get output size of projection.
+   */
+  size_t getOutputSize() const { return config_.output_size(); }
+
+ protected:
+  /**
+   * Create layer function. Function is called in forward or backward.
+   * \param function, Layer::forward_ or Layer::backward_
+   * \param name, function name
+   * \param config, initialization configuration for the function
+   */
+  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
+                      const std::string& name,
+                      const FuncConfig& config) {
+    if (useGpu_) {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
+    } else {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
+    }
+    auto& func = function.back();
+    func->init(config);
+  }
+
+ protected:
+  /// Config of projection
+  ProjectionConfig config_;
+  /// Parameter of projection
+  ParameterPtr parameter_;
+  bool useGpu_;
+
+  /// Store `in` passed to forward()
+  const Argument* in_;
+  /// Store `out` passed to forward()
+  const Argument* out_;
+  /// Store `passType` passed to forward()
+  PassType passType_;
+  /// Layer forward function
+  std::vector<std::shared_ptr<FunctionBase>> forward_;
+  /// Layer backward function
+  std::vector<std::shared_ptr<FunctionBase>> backward_;
+};
+}  // namespace paddle
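As a sketch of how a concrete projection is meant to plug into this interface; IdentityLikeProjection is hypothetical, and the Matrix/Argument calls are assumptions inferred from the declarations above rather than code from this patch:

// Hypothetical projection: adds its input to its output unchanged.
class IdentityLikeProjection : public Projection {
 public:
  IdentityLikeProjection(const ProjectionConfig& config,
                         ParameterPtr parameter,
                         bool useGpu)
      : Projection(config, parameter, useGpu) {}

  // in_/out_ were stored by the non-virtual forward() wrapper above.
  void forward() override { out_->value->add(*in_->value); }

  void backward(const UpdateCallback& callback) override {
    if (in_->grad) in_->grad->add(*out_->grad);
    (void)callback;  // no parameter to update in this toy projection
  }
};

REGISTER_PROJECTION(identity_like, IdentityLikeProjection);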
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/legacy/gserver/layers/ROIPoolLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/ROIPoolLayer.cpp
rename to paddle/legacy/gserver/layers/ROIPoolLayer.cpp
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/legacy/gserver/layers/ROIPoolLayer.h
similarity index 100%
rename from paddle/gserver/layers/ROIPoolLayer.h
rename to paddle/legacy/gserver/layers/ROIPoolLayer.h
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/legacy/gserver/layers/RecurrentLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/RecurrentLayer.cpp
rename to paddle/legacy/gserver/layers/RecurrentLayer.cpp
diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/legacy/gserver/layers/RecurrentLayer.h
similarity index 100%
rename from paddle/gserver/layers/RecurrentLayer.h
rename to paddle/legacy/gserver/layers/RecurrentLayer.h
diff --git a/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp b/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f121bdb4ab39e0d618d536b75e8c47d4520fe0b
--- /dev/null
+++ b/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include "paddle/legacy/gserver/layers/Layer.h"
+
+#include "paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * Recurrent layer group is a group of layers, which forwards/backwards one
+ * frame after the previous frame has gone forward/backward through all
+ * layers in the group.
+ * It's automatically added by config_parser if some layers are defined
+ * between RecurrentLayerGroupBegin and RecurrentLayerGroupEnd.
+ */
+class RecurrentLayerGroup : public Layer {
+ public:
+  explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {}
+
+  void initSubNetwork(NeuralNetwork* rootNetwork,
+                      const ModelConfig& config,
+                      const std::vector<ParameterType>& parameterTypes,
+                      bool useGpu) override;
+
+  void forward(PassType passType) override {
+    REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str());
+    const std::vector<Argument> inArgs;
+    std::vector<Argument> outArgs;
+    network_->forward(inArgs, &outArgs, passType);
+  }
+  void backward(const UpdateCallback& callback) override {
+    REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str());
+    network_->backward(nullptr);
+
+    for (auto& para : parameters_) {
+      para->incUpdate(callback);
+    }
+  }
+
+  /**
+   * @see Layer.accessSubNetwork
+   */
+  void accessSubNetwork(
+      const std::function<void(NeuralNetwork&)>& callback) override {
+    callback(*network_);
+  }
+
+ private:
+  std::unique_ptr<RecurrentGradientMachine> network_;
+};
+
+REGISTER_LAYER(recurrent_layer_group, RecurrentLayerGroup);
+
+void RecurrentLayerGroup::initSubNetwork(
+    NeuralNetwork* rootNetwork,
+    const ModelConfig& config,
+    const std::vector<ParameterType>& parameterTypes,
+    bool useGpu) {
+  setNeedGradient(true);
+
+  network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork));
+  ParamInitCallback cb = [rootNetwork](int paramId, Parameter* para) {
+    para->enableSharedType(
+        PARAMETER_VALUE,
+        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_VALUE),
+        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_VALUE));
+    para->enableSharedType(
+        PARAMETER_GRADIENT,
+        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_GRADIENT),
+        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_GRADIENT));
+  };
+  network_->init(config, cb, parameterTypes, useGpu);
+
+  for (auto paramId : network_->getParameterIds()) {
+    ParameterPtr parameter = rootNetwork->getParameters()[paramId];
+    parameter->incShared();
+    CHECK_EQ(parameter->getDeviceId(), getDeviceId());
+    parameters_.push_back(parameter);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ResizeLayer.cpp b/paddle/legacy/gserver/layers/ResizeLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8f8aad820f7d6d2be0af74d607d763912c3c0f2a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ResizeLayer.cpp
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief A layer for resizing a minibatch matrix h*w to h'*w'
+ * @note
+ * origin matrix: (height * width)
+ * resize matrix: (height * width / size) * size
+ */
+class ResizeLayer : public Layer {
+ public:
+  explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+};
+
+REGISTER_LAYER(resize, ResizeLayer);
+
+bool ResizeLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  CHECK_EQ(1U, inputLayers_.size());
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void ResizeLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const Argument& input = getInput(0);
+  size_t height = input.value->getHeight();
+  size_t width = input.value->getWidth();
+  CHECK_EQ((height * width) % getSize(), 0UL);
+
+  reserveOutput(height * width / getSize(), getSize());
+  MatrixPtr tmp =
+      Matrix::create(output_.value->getData(), height, width, false, useGpu_);
+  tmp->assign(*input.value);
+}
+
+void ResizeLayer::backward(const UpdateCallback& callback) {
+  const Argument& input = getInput(0);
+  size_t height = input.value->getHeight();
+  size_t width = input.value->getWidth();
+
+  if (!input.grad) {
+    return;
+  }
+
+  MatrixPtr tmp = Matrix::create(input.grad->getData(),
+                                 height * width / getSize(),
+                                 getSize(),
+                                 false,
+                                 useGpu_);
+  tmp->add(*output_.grad);
+}
+
+}  // namespace paddle
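A minimal sketch of what the resize amounts to, assuming CPU floats: the buffer and element order are untouched, only the row/column split changes.

#include <cstdio>

int main() {
  // A 2 x 6 input viewed with getSize() == 3 becomes a 4 x 3 output.
  const int height = 2, width = 6, size = 3;
  float data[height * width] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  const int newHeight = height * width / size;  // 4 rows after the resize
  for (int i = 0; i < newHeight; ++i) {
    for (int j = 0; j < size; ++j) printf("%g ", data[i * size + j]);
    printf("\n");
  }
  return 0;
}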
diff --git a/paddle/gserver/layers/RotateLayer.cpp b/paddle/legacy/gserver/layers/RotateLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/RotateLayer.cpp
rename to paddle/legacy/gserver/layers/RotateLayer.cpp
diff --git a/paddle/legacy/gserver/layers/RotateLayer.h b/paddle/legacy/gserver/layers/RotateLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..498e24372b8ca17c21ebecbe6a8c8b40217ab259
--- /dev/null
+++ b/paddle/legacy/gserver/layers/RotateLayer.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A layer for rotating a multi-channel feature map (M x N x C) in the spatial
+ * domain.
+ * The rotation is 90 degrees clockwise for each channel:
+ * \f[
+ *   y(j, i, :) = x(M - i - 1, j, :)
+ * \f]
+ * where \f$x\f$ is the (M x N x C) input, and \f$y\f$ is the (N x M x C)
+ * output.
+ *
+ * The config file api is rotate_layer
+ *
+ */
+
+class RotateLayer : public Layer {
+ public:
+  explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback = nullptr);
+
+ private:
+  int batchSize_;
+  int size_;
+  int height_;
+  int width_;
+  int channels_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RowConvLayer.cpp b/paddle/legacy/gserver/layers/RowConvLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/RowConvLayer.cpp
rename to paddle/legacy/gserver/layers/RowConvLayer.cpp
diff --git a/paddle/gserver/layers/RowConvLayer.h b/paddle/legacy/gserver/layers/RowConvLayer.h
similarity index 100%
rename from paddle/gserver/layers/RowConvLayer.h
rename to paddle/legacy/gserver/layers/RowConvLayer.h
diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/legacy/gserver/layers/RowL2NormLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/RowL2NormLayer.cpp
rename to paddle/legacy/gserver/layers/RowL2NormLayer.cpp
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/legacy/gserver/layers/SamplingIdLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/SamplingIdLayer.cpp
rename to paddle/legacy/gserver/layers/SamplingIdLayer.cpp
diff --git a/paddle/gserver/layers/ScaleShiftLayer.cpp b/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/ScaleShiftLayer.cpp
rename to paddle/legacy/gserver/layers/ScaleShiftLayer.cpp
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/ScaleSubRegionLayer.cpp
rename to paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h
similarity index 100%
rename from paddle/gserver/layers/ScaleSubRegionLayer.h
rename to paddle/legacy/gserver/layers/ScaleSubRegionLayer.h
diff --git a/paddle/legacy/gserver/layers/ScalingLayer.cpp b/paddle/legacy/gserver/layers/ScalingLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e68ff8905ee9e9965437addd2ef583c2d8b279e8
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ScalingLayer.cpp
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer that multiplies each row of a matrix by an element of a vector,
+ * which is used in NEURAL TURING MACHINE.
+ * \f[
+ *   y.row[i] = w[i] * x.row[i]
+ * \f]
+ * where \f$x\f$ is a (batchSize x dataDim) input, \f$w\f$ is a
+ * (batchSize x 1) weight vector, and \f$y\f$ is a (batchSize x dataDim)
+ * output.
+ *
+ * The config file api is scaling_layer.
+ */
+
+class ScalingLayer : public Layer {
+ public:
+  explicit ScalingLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ScalingLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(scaling, ScalingLayer);
+
+bool ScalingLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+
+  return true;
+}
+
+void ScalingLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr weightV = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV1->getHeight();
+  size_t dataDim = inV1->getWidth();
+
+  CHECK_EQ(dataDim, getSize());
+  CHECK_EQ(weightV->getWidth(), 1U);
+  CHECK_EQ(weightV->getHeight(), batchSize);
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    resetOutput(batchSize, dataDim);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwScalingTimer", getName().c_str());
+    // outV += inV1 * weight
+    outV->addRowScale(0, *inV1, *weightV);
+  }
+}
+
+void ScalingLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr weightV = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+  MatrixPtr outG = getOutputGrad();
+
+  {
+    REGISTER_TIMER_INFO("BwScalingTimer", getName().c_str());
+
+    if (inG0) {
+      // inG0 += outG .* inV1
+      inG0->rowDotMul(0, *outG, *inV1);
+    }
+
+    if (inG1) {
+      // inG1 += outG * weight
+      inG1->addRowScale(0, *outG, *weightV);
+    }
+  }
+}
+
+}  // namespace paddle
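A toy row-wise rendering of the forward and backward rules implemented above; the function name and raw-array interface are illustrative only:

#include <cstddef>

// forward:   y[i][j]   = w[i] * x[i][j]
// backward:  wG[i]    += dot(outG[i], x[i])   (the rowDotMul branch)
//            xG[i][j] += w[i] * outG[i][j]    (the addRowScale branch)
void scalingRow(const float* x, float w, const float* outG, size_t dim,
                float* y, float* xGrad, float* wGrad) {
  for (size_t j = 0; j < dim; ++j) {
    y[j] = w * x[j];
    *wGrad += outG[j] * x[j];
    xGrad[j] += w * outG[j];
  }
}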
*/ + +#include "SelectiveFullyConnectedLayer.h" +#include +#include +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(selective_fc, SelectiveFullyConnectedLayer); + +bool SelectiveFullyConnectedLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + inputNum_ = inputLayers_.size(); + if (config_.has_selected_colums()) { + inputNum_ -= 1; + } + for (size_t i = 0; i < inputNum_; i++) { + size_t height = inputLayers_[i]->getSize(); + size_t width = getSize(); + // NOTE weight is transpoed + weights_.emplace_back(new Weight(width, height, parameters_[i])); + } + + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + + fullOutput_ = false; + + return true; +} + +void SelectiveFullyConnectedLayer::prefetch() {} + +void SelectiveFullyConnectedLayer::reserveOutput(size_t height, + size_t width, + size_t nnz) { + bool flag = (passType_ == PASS_TEST && + config_.selective_fc_pass_generation() && !fullOutput_); + SetDevice device(output_.deviceId); + if (flag) { + // output_.value is sparse matrix + if (dynamic_cast(output_.value.get()) || + dynamic_cast(output_.value.get())) { + output_.value = nullptr; + } + Matrix::resizeOrCreateSparseMatrix(output_.value, + height, + width, + nnz, + FLOAT_VALUE, + SPARSE_CSR, + /*trans=*/false, + /*useGpu=*/useGpu_); + output_.value->copyFrom(*selCols_); + interOutput_ = output_.value; + } else { + if (fullOutput_) { + // output_.value is dense matrix + if (dynamic_cast(output_.value.get()) || + dynamic_cast(output_.value.get())) { + output_.value = nullptr; + } + Matrix::resizeOrCreate(output_.value, + height, + width, + /*trans=*/false, + /*useGpu=*/useGpu_); + interOutput_ = output_.value; + } else { + // output_.value is dense matrix, but width = nnz /height + CHECK_EQ(nnz % height, 0U); + CHECK(nnz / height); + Matrix::resizeOrCreate(output_.value, + height, + nnz / height, + /*trans=*/false, + /*useGpu=*/useGpu_); + interOutput_ = Matrix::createSparseMatrix(output_.value->getData(), + selCols_->getRows(), + selCols_->getCols(), + height, + width, + nnz, + FLOAT_VALUE, + SPARSE_CSR, + /*trans=*/false, + /*useGpu=*/useGpu_); + } + } + interOutput_->zeroMem(); + + if (passType_ != PASS_TEST && needGradient()) { + CHECK_EQ(nnz % height, 0U) << "during training, each sample must have a " + "same number of selected columns."; + CHECK(nnz / height) + << "during training, " + "each sample must have at least one column selected."; + Matrix::resizeOrCreate(output_.grad, + height, + nnz / height, + /*trans=*/false, + /*useGpu=*/useGpu_); + output_.grad->zeroMem(); + } +} + +void SelectiveFullyConnectedLayer::forward(PassType passType) { + REGISTER_TIMER("selective_fc.forward"); + Layer::forward(passType); + + getSelectiveCols(); + size_t height = getInput(0).getBatchSize(); + size_t width = getSize(); + size_t nnz = height * width; + if (!fullOutput_) { + CHECK(selCols_); + CHECK(height == selCols_->getHeight()); + CHECK(width == selCols_->getWidth()); + nnz = selCols_->getElementCnt(); + } + + // Layer::ResetOutput(), here we set outV/outG as SparseMatrix manually + // this outV should be used as input of MaxIdLayer and softmax activation + reserveOutput(height, width, nnz); + + bool flag = true; + for (size_t i = 0; i < inputNum_; i++) { + MatrixPtr input = getInputValue(i); + MatrixPtr weight = weights_[i]->getW(); + size_t hsize = 
+    size_t wsize = weight->getHeight();
+    real scaleT = i == 0 ? real(0) : real(1);
+
+    flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() &&
+           !fullOutput_;
+    if (flag) {
+      // if the indices are highly sparse,
+      // manually compute the multiplication of
+      // the input vector and the selected rows.
+      REGISTER_TIMER("selective.plain");
+      interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
+    } else {
+      // if the indices are not sparse enough,
+      // use full mul instead
+      REGISTER_TIMER("selective.mul");
+      if (fullOutput_) {
+        interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
+      } else {
+        Matrix::resizeOrCreate(mmat_,
+                               hsize,
+                               wsize,
+                               /*trans=*/false,
+                               /*useGpu=*/useGpu_);
+        mmat_->mul(*input, *weight->getTranspose());
+        interOutput_->add3(mmat_);
+      }
+    }
+  }
+
+  if (biases_) {
+    interOutput_->addBias(*(biases_->getW()), 1);
+  }
+
+  flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() &&
+          !fullOutput_);
+  if (flag) {
+    // during generation, the output of this layer is a sparse csr matrix,
+    // which is probably the input of a maxid layer.
+    // if the model is trained with multi-class-cross-entropy-with-selfnorm,
+    // the activation of this layer should be exponential, not softmax.
+
+    Argument arg;
+    arg.value = Matrix::create(interOutput_->getData(),
+                               1,
+                               nnz,
+                               /*trans=*/false,
+                               /*useGpu=*/useGpu_);
+    //! TODO(yuyang18): Why we cannot invoke forwardActivation here?
+    activation_->forward(arg).check();
+  } else /* train and test in train, not generating */ {
+    // during training, this layer's output value is a *Matrix*, which is the
+    // input of e.g. multi-class-cross-entropy.
+
+    // while training, every sample has an equal number of selected
+    // columns to be activated.
+    // note that the indices of multi-class-cross-entropy need to be remapped
+    // to this index.
+    // e.g. sample = [1,3,5] and 3 is gold, then label is 1
+
+    forwardActivation();
+  }
+}
+
+void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) {
+  backwardActivation();
+  MatrixPtr oGrad = getOutputGrad();
+  if (!fullOutput_) {
+    interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(),
+                                               interOutput_->getRows(),
+                                               interOutput_->getCols(),
+                                               interOutput_->getHeight(),
+                                               interOutput_->getWidth(),
+                                               interOutput_->getElementCnt(),
+                                               FLOAT_VALUE,
+                                               SPARSE_CSR,
+                                               /*trans=*/false,
+                                               /*useGpu=*/useGpu_);
+  } else {
+    interOutGrad_ = Matrix::create(oGrad->getData(),
+                                   oGrad->getHeight(),
+                                   oGrad->getWidth(),
+                                   /*trans=*/false,
+                                   /*useGpu=*/useGpu_);
+  }
+
+  if (biases_ && biases_->getWGrad()) {
+    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
+    biases_->getWGrad()->collectBias(*interOutGrad_, 1);
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  // backward is different from FullyConnectedLayer
+  // because the weight is transposed
+  for (size_t i = 0; i < inputNum_; i++) {
+    AsyncGpuBlock block;
+    MatrixPtr preGrad = getInputGrad(i);
+    if (preGrad) {
+      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
+      preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1);
+    }
+
+    MatrixPtr wGrad = weights_[i]->getWGrad();
+    if (wGrad) {
+      REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
+      MatrixPtr input = getInputValue(i);
+      wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1);
+    }
+
+    {
+      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+      weights_[i]->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+
+void paddle::SelectiveFullyConnectedLayer::fillSelectiveData(
+    const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates) {
+  if (candidates == nullptr) {
+    fillFullySelectiveData();
+    return;
+  }
+
+  size_t sampleNum = candidates->size();
+  size_t outputWidth = getSize();
+  size_t nnz =
+      std::accumulate(candidates->begin(),
+                      candidates->end(),
+                      0UL,
+                      [](size_t a, const std::pair<int*, size_t>& arr) {
+                        return a + arr.second;
+                      });
+
+  Matrix::resizeOrCreateSparseMatrix(this->cpuSelCols_,
+                                     sampleNum,
+                                     outputWidth,
+                                     nnz,
+                                     NO_VALUE,
+                                     SPARSE_CSR,
+                                     false,
+                                     false);
+  CHECK(this->cpuSelCols_ != nullptr);
+  CpuSparseMatrixPtr selCols =
+      std::dynamic_pointer_cast<CpuSparseMatrix>(cpuSelCols_);
+  int* rowOffsets = selCols->getRows();
+  int* colIndices = selCols->getCols();
+
+  rowOffsets[0] = 0;
+  int idx = 0;
+  for (size_t i = 0; i < sampleNum; ++i) {
+    if ((*candidates)[i].second > 0) {
+      rowOffsets[i + 1] = rowOffsets[i] + (*candidates)[i].second;
+      for (size_t j = 0; j < (*candidates)[i].second; ++j) {
+        colIndices[idx] = (*candidates)[i].first[j];
+        idx++;
+      }
+    } else {
+      rowOffsets[i + 1] = rowOffsets[i];
+    }
+  }
+
+  CHECK_EQ(static_cast<size_t>(rowOffsets[sampleNum]), nnz);
+  if (!useGpu_) {
+    this->selCols_ = this->cpuSelCols_;
+  } else {
+    Matrix::resizeOrCreateSparseMatrix(this->selCols_,
+                                       sampleNum,
+                                       outputWidth,
+                                       nnz,
+                                       NO_VALUE,
+                                       SPARSE_CSR,
+                                       false,
+                                       true);
+    this->selCols_->copyFrom(*cpuSelCols_, HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_1);
+  }
+
+  fullOutput_ = false;
+}
+
+void paddle::SelectiveFullyConnectedLayer::getSelectiveCols() {
+  if (config_.has_selected_colums()) {
+    this->selCols_ = inputLayers_[inputNum_]->getOutputValue();
+    fullOutput_ = false;
+  } else if (!config_.selective_fc_pass_generation() || selCols_ == nullptr) {
+    this->fillFullySelectiveData();
+  }  // else selCols_ is initialized by fillSelectiveData
+}
+
+}  // namespace paddle
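A small standalone sketch of the CSR pattern fillSelectiveData() builds, using made-up candidate columns; the pattern alone matters since the matrix is created with NO_VALUE:

#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Two samples select columns {2, 7, 9} and {0, 5} of the weight matrix.
  int s0[] = {2, 7, 9};
  int s1[] = {0, 5};
  std::vector<std::pair<int*, size_t>> candidates = {{s0, 3}, {s1, 2}};

  std::vector<int> rowOffsets = {0};  // sample i owns cols[rowOffsets[i]..rowOffsets[i+1])
  std::vector<int> colIndices;
  for (auto& c : candidates) {
    rowOffsets.push_back(rowOffsets.back() + static_cast<int>(c.second));
    for (size_t j = 0; j < c.second; ++j) colIndices.push_back(c.first[j]);
  }
  // rowOffsets = [0, 3, 5], colIndices = [2, 7, 9, 0, 5], nnz = 5
  printf("nnz = %d\n", rowOffsets.back());
  return 0;
}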
diff --git a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..068da57d8d2a79297ce95acd0f152514c9dcb65e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * @brief The SelectiveFullyConnectedLayer class
+ *
+ * SelectiveFullyConnectedLayer differs from FullyConnectedLayer in that it
+ * requires an additional input to indicate several selected columns, and only
+ * computes the multiplications between the input matrices and the selected
+ * columns of the parameter matrices of this layer. If the selected columns
+ * are not specified, SelectiveFullyConnectedLayer acts exactly like
+ * FullyConnectedLayer.
+ *
+ * The config file api is selective_fc_layer.
+ */
+class SelectiveFullyConnectedLayer : public Layer {
+ protected:
+  WeightList weights_;
+  std::unique_ptr<Weight> biases_;
+
+ private:
+  /**
+   * Get selected columns each forward.
+   */
+  void getSelectiveCols();
+
+  MatrixPtr mmat_;
+  /// cpuSelCols_ is a CpuSparseMatrix, used to save selected columns.
+  MatrixPtr cpuSelCols_;
+  /// CpuSparseMatrix or GpuSparseMatrix. In CPU mode, selCols_ points
+  /// to cpuSelCols_.
+  MatrixPtr selCols_;
+  size_t inputNum_;
+
+  /// interOutput_ shares the same memory with output_.value.
+  MatrixPtr interOutput_;
+
+  /// if fullOutput_ is false, interOutGrad_ is a sparse matrix
+  MatrixPtr interOutGrad_;
+
+  /// if true, output_.value is the same as that of a fully connected layer
+  bool fullOutput_;
+
+ public:
+  explicit SelectiveFullyConnectedLayer(const LayerConfig& config)
+      : Layer(config), selCols_(nullptr) {}
+
+  ~SelectiveFullyConnectedLayer() {}
+  void prefetch() override;
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  Weight& getWeight(int idx) { return *weights_[idx]; }
+
+  /**
+   * @brief Resize the output matrix size.
+   * And reset value to zero
+   */
+  void reserveOutput(size_t height, size_t width, size_t nnz);
+
+  /**
+   * @brief Fill candidates to select several activations as output.
+   * @param candidates specifies several selected columns of the parameter
+   *        matrices of this layer.
+   *        Multiplications are computed only between the input matrices and
+   *        the selected columns.
+   *        If candidates is a nullptr, the selective fc layer acts exactly
+   *        like the fully connected layer.
+   * @note CURRENTLY, THIS METHOD IS ONLY USED FOR BEAM SEARCH
+   */
+  void fillSelectiveData(
+      const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates);
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ private:
+  /**
+   * @brief Make SelectiveFC act as FullyConnectedLayer
+   */
+  void fillFullySelectiveData() { fullOutput_ = true; }
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp b/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..024ca048b4a0fb28b068fd69fafcfd9b313dbb72
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp
@@ -0,0 +1,189 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for concatenating the first sequence with the second sequence
+ * Input: two sequences each containing the same number of instances
+ *   seq1 = [a1, a2, ..., an]
+ *   seq2 = [b1, b2, ..., bn]
+ * Output: a concatenated sequence of the two input sequences
+ *   out = [a1, b1, a2, b2, ..., an, bn]
+ */
+
+class SequenceConcatLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> biases_;
+
+ public:
+  explicit SequenceConcatLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~SequenceConcatLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(seqconcat, SequenceConcatLayer);
+
+bool SequenceConcatLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  // sequence concatenation layer should have exactly 2 inputs
+  CHECK_EQ(2U, inputLayers_.size());
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequenceConcatLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  size_t dim = getSize();
+
+  const Argument& input1 = getInput(0);
+  size_t numSequences1 = input1.getNumSequences();
+  auto startPositions1 = input1.sequenceStartPositions->getVector(false);
+
+  const Argument& input2 = getInput(1);
+  size_t numSequences2 = input2.getNumSequences();
+  auto startPositions2 = input2.sequenceStartPositions->getVector(false);
+
+  CHECK_EQ(dim, input1.value->getWidth());
+  CHECK_EQ(startPositions1->getData()[numSequences1], input1.getBatchSize());
+  CHECK_EQ(numSequences1, startPositions1->getSize() - 1);
+
+  CHECK_EQ(dim, input2.value->getWidth());
+  CHECK_EQ(startPositions2->getData()[numSequences2], input2.getBatchSize());
+  CHECK_EQ(numSequences2, startPositions2->getSize() - 1);
+
+  CHECK_EQ(numSequences1, numSequences2);
+
+  MatrixPtr inputValue1 = getInputValue(0);
+  MatrixPtr inputValue2 = getInputValue(1);
+
+  // reset output
+  reserveOutput(inputValue1->getHeight() + inputValue2->getHeight(), dim);
+
+  MatrixPtr outputValue = getOutputValue();
+
+  const int* starts1 = startPositions1->getData();
+  const int* starts2 = startPositions2->getData();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SequenceConcatLayerForward", getName().c_str());
+
+    size_t offset = 0;
+    size_t leftNumIns = 0;
+    size_t rightNumIns = 0;
+    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
+      leftNumIns = starts1[seqId + 1] - starts1[seqId];
+      outputValue->subMatrix(offset, leftNumIns)
+          ->assign(*(inputValue1->subMatrix(starts1[seqId], leftNumIns)));
+      offset += leftNumIns;
+
+      rightNumIns = starts2[seqId + 1] - starts2[seqId];
+      outputValue->subMatrix(offset, rightNumIns)
+          ->assign(*(inputValue2->subMatrix(starts2[seqId], rightNumIns)));
+      offset += rightNumIns;
+    }
+
+    // modify the sequenceStartPositions
+    ICpuGpuVector::resizeOrCreate(
+        output_.sequenceStartPositions, numSequences1 + 1, false);
+
+    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
+
+    for (size_t seqId = 0; seqId < numSequences1 + 1; ++seqId) {
+      tgtBuf[seqId] = starts1[seqId] + starts2[seqId];
+    }
+  }
+
+  if (biases_.get() != NULL) {
+    MatrixPtr outV = getOutputValue();
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation */
+  forwardActivation();
+}
+
+void SequenceConcatLayer::backward(const UpdateCallback& callback) {
+  /* activation */
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    // increase the update counter of the bias parameter
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  MatrixPtr inputGrad1 = getInputGrad(0);
+  MatrixPtr inputGrad2 = getInputGrad(1);
+  MatrixPtr outputGrad = getOutputGrad();
+  auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false);
+  auto startPositions2 = getInput(1).sequenceStartPositions->getVector(false);
+
+  size_t numSequences1 = startPositions1->getSize() - 1;
+  size_t numSequences2 = startPositions2->getSize() - 1;
+
+  CHECK_EQ(numSequences1, numSequences2);
+
+  const int* starts1 = startPositions1->getData();
+  const int* starts2 = startPositions2->getData();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SequenceConcatLayerBackward", getName().c_str());
+
+    size_t offset = 0;
+    size_t leftNumIns = 0;
+    size_t rightNumIns = 0;
+    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
+      leftNumIns = starts1[seqId + 1] - starts1[seqId];
+      if (inputGrad1) {
+        inputGrad1->subMatrix(starts1[seqId], leftNumIns)
+            ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
+      }
+      offset += leftNumIns;
+
+      rightNumIns = starts2[seqId + 1] - starts2[seqId];
+      if (inputGrad2) {
+        inputGrad2->subMatrix(starts2[seqId], rightNumIns)
+            ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
+      }
+      offset += rightNumIns;
+    }
+  }
+}
+
+}  // namespace paddle
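A tiny standalone check of the interleaving and of the new start positions computed above (tgtBuf[k] = starts1[k] + starts2[k]), with made-up sequence boundaries:

#include <cstdio>

int main() {
  const int numSeq = 2;
  int starts1[] = {0, 2, 5};  // seq1: lengths 2 and 3
  int starts2[] = {0, 1, 3};  // seq2: lengths 1 and 2
  int outStarts[numSeq + 1];
  // Output sequence k holds seq1's instances then seq2's, so its start
  // is the sum of both input starts.
  for (int k = 0; k <= numSeq; ++k) outStarts[k] = starts1[k] + starts2[k];
  for (int k = 0; k <= numSeq; ++k) printf("%d ", outStarts[k]);  // 0 3 8
  printf("\n");
  return 0;
}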
diff --git a/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b00bf65997b656379463f651b6ff78ddd03ee300
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -0,0 +1,118 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Logging.h"
+
+#include "SequencePoolLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for extracting the last instance of the input sequence.
+ * Input: a sequence
+ * If SequenceLevel = kNonseq:
+ *   Output: a sequence containing only the last instance of the input
+ *   sequence
+ *   If stride_ > 0:
+ *     Output: a shortened sequence. Stride is the step size by which we
+ *     slide a window upon the input sequence, and the last-instance
+ *     operation is then applied to each interval independently.
+ * If SequenceLevel = kSeq:
+ *   The input sequence must have sub-sequences
+ *   Output: a sequence containing only the last instance of each
+ *   sub-sequence of the input sequence
+ *
+ * The config file api is last_seq and first_seq.
+ */
+
+class SequenceLastInstanceLayer : public SequencePoolLayer {
+ protected:
+  MatrixPtr tmpSrc_;
+  MatrixPtr tmpDest_;
+  std::vector<int> instanceIds_;
+
+ public:
+  explicit SequenceLastInstanceLayer(const LayerConfig& config)
+      : SequencePoolLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
+
+bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  SequencePoolLayer::init(layerMap, parameterMap);
+  reversed_ = config_.select_first();
+
+  tmpSrc_ =
+      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
+  tmpDest_ =
+      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
+
+  return true;
+}
+
+void SequenceLastInstanceLayer::forward(PassType passType) {
+  SequencePoolLayer::forward(passType);
+
+  auto starts = startPositions_->getData(false);
+  MatrixPtr inputValue = getInputValue(0);
+  MatrixPtr outputValue = getOutputValue();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
+
+    instanceIds_.clear();
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
+      int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1;
+      instanceIds_.push_back(insId);
+
+      outputValue->subMatrix(seqId, 1, tmpDest_)
+          ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
+    }
+  }
+
+  if (biases_.get() != NULL) {
+    outputValue->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation, should set to 'linear' in most cases */
+  forwardActivation();
+}
+
+void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
+  SequencePoolLayer::backward(callback);
+
+  MatrixPtr inputGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+
+  if (inputGrad) {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward",
+                        getName().c_str());
+
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
+      inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_)
+          ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_)));
+    }
+  }
+}
+
+}  // namespace paddle
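A minimal illustration of the instance selection above, with assumed start positions; select_first() in the config flips between the first and the last row of each sequence:

#include <cstdio>

int main() {
  int starts[] = {0, 4, 9};  // two sequences: rows 0..3 and 4..8
  for (int seqId = 0; seqId < 2; ++seqId) {
    bool reversed = false;  // false: last_seq behavior; true: first_seq
    int insId = reversed ? starts[seqId] : starts[seqId + 1] - 1;
    printf("seq %d -> row %d\n", seqId, insId);  // rows 3 and 8
  }
  return 0;
}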
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/legacy/gserver/layers/SequencePoolLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/SequencePoolLayer.cpp
rename to paddle/legacy/gserver/layers/SequencePoolLayer.cpp
diff --git a/paddle/legacy/gserver/layers/SequencePoolLayer.h b/paddle/legacy/gserver/layers/SequencePoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c019b313093f4ac717e0fc57a9aa798e2951580
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequencePoolLayer.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
+ *
+ * Input: one or more sequences. Each sequence contains some instances.
+ * If SequenceLevel = kNonSeq:
+ *   Output: output size is the number of input sequences (NOT input
+ *   instances)
+ *   output[i] = seqlastin/average/max_{for each instance in this
+ *   sequence}{input[i]}
+ *   If stride_ > 0:
+ *     The input sequence must not have sub-sequences
+ *     Output: a shortened sequence. Stride is the step size by which we
+ *     slide a window upon the input sequence, and the pooling operation
+ *     is then applied to each interval independently.
+ * If SequenceLevel = kSeq:
+ *   The input sequence must have sub-sequences
+ *   Output: output size is the number of input sub-sequences
+ *   output[i] = seqlastin/average/max_{for each instance in this
+ *   sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
+ */
+
+class SequencePoolLayer : public Layer {
+ protected:
+  int type_;
+  std::unique_ptr<Weight> biases_;
+  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
+  size_t newBatchSize_;
+  ICpuGpuVectorPtr startPositions_;
+  int stride_;
+  // Whether the input sequence is reversed or not.
+  bool reversed_ = false;
+
+ public:
+  explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp b/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f72acadec9d8f6108fe5f9c79ae7924c2b010d4d
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for reshaping the sequence. Assume the input sequence has
+ * T instances, the dimension of each instance is M, and the input
+ * reshape_dim is N, then the output sequence has T*M/N instances,
+ * the dimension of each instance is N.
+ *
+ * Note that T*M/N must be an integer.
+ */
+
+class SequenceReshapeLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> biases_;
+
+  MatrixPtr reshapedOutputGrad;
+
+ public:
+  explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(seqreshape, SequenceReshapeLayer);
+
+bool SequenceReshapeLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(1U, inputLayers_.size());
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequenceReshapeLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+
+  size_t inDim = input.value->getWidth();
+  size_t outDim = getSize();
+
+  size_t numSequences = input.getNumSequences();
+
+  // by default, we assume each instance as a sequence
+  IVectorPtr seqStarts;
+  IVector::resizeOrCreate(seqStarts, input.getBatchSize() + 1, false);
+  int* startsData = seqStarts->getData();
+  for (int i = 0; i < input.getBatchSize() + 1; i++) {
+    startsData[i] = i;
+  }
+  const int* starts = startsData;
+
+  // if there is sequence, then use start positions
+  if (input.sequenceStartPositions) {
+    auto startPositions = input.sequenceStartPositions->getVector(false);
+    starts = startPositions->getData();
+    CHECK_EQ(starts[numSequences], input.getBatchSize());
+    CHECK_EQ(numSequences, startPositions->getSize() - 1);
+  }
+
+  for (size_t seqID = 0; seqID < numSequences; seqID++) {
+    size_t inNumIns = starts[seqID + 1] - starts[seqID];
+    size_t outNumIns = inNumIns * inDim / outDim;
+    CHECK_EQ(outNumIns * outDim, inNumIns * inDim);
+  }
+
+  MatrixPtr inputValue = getInputValue(0);
+
+  // reset output
+  reserveOutput(inputValue->getHeight() * inDim / outDim, outDim);
+  MatrixPtr outputValue = getOutputValue();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SequenceReshapeLayerForward", getName().c_str());
+
+    outputValue->copyFrom(*inputValue);
+
+    // modify the sequenceStartPositions
+    ICpuGpuVector::resizeOrCreate(
+        output_.sequenceStartPositions, numSequences + 1, false);
+
+    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
+
+    for (size_t seqId = 0; seqId < numSequences + 1; ++seqId) {
+      tgtBuf[seqId] = starts[seqId] * inDim / outDim;
+    }
+  }
+
+  if (biases_.get() != NULL) {
+    MatrixPtr outV = getOutputValue();
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation */
+  forwardActivation();
+}
+
+void SequenceReshapeLayer::backward(const UpdateCallback& callback) {
+  /* activation */
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    // increase the update counter of the bias parameter
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  MatrixPtr inputGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+
+  AsyncGpuBlock asyncGpuBlock;
+  REGISTER_TIMER_INFO("SequenceReshapeLayerBackward", getName().c_str());
+
+  if (inputGrad) {
+    Matrix::resizeOrCreate(reshapedOutputGrad,
+                           inputGrad->getHeight(),
+                           inputGrad->getWidth(),
+                           false,
+                           useGpu_);
+    reshapedOutputGrad->copyFrom(*outputGrad);
+    inputGrad->add(*reshapedOutputGrad);
+  }
+}
+
+}  // namespace paddle
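A small standalone sketch of the reshape arithmetic, using assumed sizes T = 4, M = 6, N = 8; the data stays in place and only the start positions are rescaled by inDim / outDim:

#include <cstdio>

int main() {
  // One sequence of 4 instances of width 6 (24 elements total), reshaped
  // to width 8: the sequence now has 24 / 8 = 3 instances.
  int starts[] = {0, 4};
  const int inDim = 6, outDim = 8;
  for (int s = 0; s < 2; ++s)
    printf("%d ", starts[s] * inDim / outDim);  // new starts: 0 3
  printf("\n");
  return 0;
}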
diff --git a/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp b/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..65b4787fed3aa029dac4663c8f8dd6097d952c44
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+class SequenceSliceLayer : public Layer {
+ public:
+  explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ private:
+  /*
+   * TODO(caoying)
+   * In PaddlePaddle, currently all matrices are real number types,
+   * but the second and the (optional) third input, which are selected
+   * indices of the given sequence used to trim it, actually hold int
+   * values. Storing int information in real-number matrices is dangerous,
+   * since the real numbers will be converted to int. If a user fills this
+   * matrix himself, invalid data may occur.
+   */
+
+  MatrixPtr startIdsOnCpu_;
+  MatrixPtr endIdsOnCpu_;
+
+  std::vector<int> selectedRows_;
+  IVectorPtr rowIndice_;
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+  std::vector<int> outSubSeqStartPos_;
+  std::vector<int> outSeqStartPos_;
+
+  void checkInputs();
+  void copySliceIdsToCpu();
+  void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends);
+};
+
+REGISTER_LAYER(seq_slice, SequenceSliceLayer);
+
+bool SequenceSliceLayer::init(const LayerMap& layerMap,
+                              const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_GE(inputLayers_.size(), 2U);
+  CHECK_LE(inputLayers_.size(), 3U);
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequenceSliceLayer::checkInputs() {
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
+                           << "must be a sequence.";
+  const MatrixPtr indices1 = getInputValue(1);
+  CHECK_EQ(
+      indices1->getHeight(),
+      static_cast<size_t>(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
+                                               : inputSeq.getNumSequences()))
+                              ? inputSeq.getNumSubSequences()
+                              : inputSeq.getNumSequences()))
+      << "Height of the second input should be equal to the number of "
+      << "sequences in the first input.";
+  if (inputLayers_.size() == 3) {
+    const MatrixPtr indices2 = getInputValue(2);
+    CHECK_EQ(indices2->getHeight(), indices1->getHeight())
+        << "start indices and end indices should have the same height.";
+    CHECK_EQ(indices2->getWidth(), indices1->getWidth())
+        << "start indices and end indices should have the same width.";
+  }
+}
+
+void SequenceSliceLayer::copySliceIdsToCpu() {
+  const MatrixPtr indices1 = getInputValue(1);
+  if (inputLayers_.size() == 2U) {
+    if (config_.select_first()) {
+      Matrix::resizeOrCreate(startIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      startIdsOnCpu_->copyFrom(*indices1);
+      endIdsOnCpu_ = nullptr;
+    } else {
+      Matrix::resizeOrCreate(endIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      endIdsOnCpu_->copyFrom(*indices1);
+      startIdsOnCpu_ = nullptr;
+    }
+  } else if (inputLayers_.size() == 3U) {
+    Matrix::resizeOrCreate(startIdsOnCpu_,
+                           indices1->getHeight(),
+                           indices1->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    startIdsOnCpu_->copyFrom(*indices1);
+
+    const MatrixPtr indices2 = getInputValue(2);
+    Matrix::resizeOrCreate(endIdsOnCpu_,
+                           indices2->getHeight(),
+                           indices2->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    endIdsOnCpu_->copyFrom(*indices2);
+  }
+}
+
+void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
+                                         const MatrixPtr ends) {
+  CHECK(starts || ends) << "At least one of the start or end indices "
+                        << "should be given.";
+
+  bool hasSubseq = getInput(0).hasSubseq();
+
+  outSeqStartPos_.resize(1, 0);
+  outSubSeqStartPos_.resize(1, 0);
+  selectedRows_.clear();
+
+  size_t beamSize = starts ? starts->getWidth() : ends->getWidth();
+  size_t rowIdx = 0;
+  for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) {
+    for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) {
+      for (size_t k = 0; k < beamSize; ++k) {
+        if (starts && starts->getElement(rowIdx, k) == -1.) break;
+        if (ends && ends->getElement(rowIdx, k) == -1.) break;
+
+        int begPos = inputSeqInfoVec_[i][j];
+        if (starts) begPos += starts->getElement(rowIdx, k);
+
+        int endPos = inputSeqInfoVec_[i][j + 1] - 1;
+        if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
+
+        int seqLen = endPos - begPos + 1;
+        CHECK_GT(seqLen, 0);
+        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
+        hasSubseq
+            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
+            : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
+      }
+      rowIdx++;
+    }
+    if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back());
+  }
+
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartPos_.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartPos_.data(), outSeqStartPos_.size(), false);
+
+  if (hasSubseq) {
+    ICpuGpuVector::resizeOrCreate(
+        output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
+    output_.subSequenceStartPositions->copyFrom(
+        outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false);
+  }
+}
+
+void SequenceSliceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  checkInputs();
+
+  const Argument& inputSeq = getInput(0);
+  inputSeqInfoVec_.clear();
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  if (!useGpu_) {
+    if (inputLayers_.size() == 2U) {
+      startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr;
+      endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1);
+    } else if (inputLayers_.size() == 3U) {
+      startIdsOnCpu_ = getInputValue(1);
+      endIdsOnCpu_ = getInputValue(2);
+    }
+  } else {
+    copySliceIdsToCpu();
+  }
+
+  /*
+   * calculate the selected row indices in a batch, and build the output
+   * sequence information.
+   */
+  calSelectedRows(startIdsOnCpu_, endIdsOnCpu_);
+
+  resetOutput(selectedRows_.size(), getSize());
+
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+
+void SequenceSliceLayer::backward(const UpdateCallback& callback) {
+  getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_);
+}
+
+} // namespace paddle
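[Editor's note: a standalone sketch of the slicing arithmetic in calSelectedRows above, for one plain (non-nested) sequence; illustration only, not part of this patch, and the numbers are made up.]

#include <cstdio>
#include <vector>

// One sequence occupying rows [2, 7) of the batch; per-sequence start and
// end offsets of 1 and 3 select rows 3..5, mirroring begPos/endPos above.
int main() {
  std::vector<int> seqStarts = {2, 7};
  float start = 1.f, end = 3.f;  // offsets relative to the sequence begin
  int begPos = seqStarts[0] + static_cast<int>(start);
  int endPos = seqStarts[0] + static_cast<int>(end);
  std::vector<int> selectedRows;
  for (int m = begPos; m <= endPos; ++m) selectedRows.push_back(m);
  for (int r : selectedRows) std::printf("%d ", r);  // prints: 3 4 5
  std::printf("\n");
  return 0;
}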
diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/legacy/gserver/layers/SequenceToBatch.cpp
similarity index 100%
rename from paddle/gserver/layers/SequenceToBatch.cpp
rename to paddle/legacy/gserver/layers/SequenceToBatch.cpp
diff --git a/paddle/legacy/gserver/layers/SequenceToBatch.h b/paddle/legacy/gserver/layers/SequenceToBatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ed517937d4a015b6b11de16412cac7599f5f8b9
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequenceToBatch.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+/*
+ * This class can be used to reorganize a matrix from the sequence structure
+ * into the batch structure.
+ * sequence matrix: [C1_s ... Cn_s | ...... | C1_t ... Cn_t]
+ * batch matrix: [C1_s ... C1_t | ...... | Cn_s ... Cn_t]
+ * Cn_s is the state for sequence s at time n.
+ *
+ * Example: sequence matrix = {{0, 0, 0, 0}, {1, 1, 1, 1, 1}, {2, 2, 2}}
+ * s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
+ * batch matrix = {{1, 0, 2}, {1, 0, 2}, {1, 0, 2}, {1, 0}, {1}}
+ * b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
+ *
+ * Use:
+ * Input: seqMatrix, seqStarts (sequence start positions)
+ * Output: batchMatrix
+ * 1. SequenceToBatch seq2batch;
+ * 2. seq2batch.resizeOrCreateBatch(seqStarts);     // calculate seq2BatchIdx
+ * 3. seq2batch.copy(seqMatrix, batchMatrix, true); // copy seq to batch matrix
+ *
+ */
+class SequenceToBatch {
+ public:
+  explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {}
+
+  /* resize and calculate the batchIndex_ */
+  void resizeOrCreateBatch(int batchSize,
+                           size_t numSequences,
+                           const int *seqStarts,
+                           bool reversed,
+                           bool prevBatchState = false);
+
+  /* sequence matrix and batch matrix copy:
+   * seq2batch: copy(seqValue, batchValue, true);
+   * batch2seq: copy(seqValue, batchValue, false);
+   */
+  void copy(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
+  /* sequence/batch matrix add to batch/sequence matrix */
+  void add(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
+  MatrixPtr getBatchValue(Matrix &batchValue, int batchId, int numRows = 0);
+
+  size_t getNumBatch() const { return numBatch_; }
+
+  /* resize or create a batch matrix (batchValue_) */
+  void resizeOrCreate(Matrix &seqValue);
+  /* copy seqValue to batchValue_ */
+  void copyFromSeq(Matrix &seqValue);
+  /* copy batchValue_ to seqValue */
+  void copyBackSeq(Matrix &seqValue);
+  MatrixPtr getBatchValue(int batchId, int numRows = 0);
+  MatrixPtr getBatchValue() { return batchValue_; }
+  /* transfer prevBatchOutput to the batch structure */
+  void prevOutput2Batch(Matrix &src, Matrix &dst);
+  /* get the sequence output from the batch structure */
+  void getSeqOutputFromBatch(Matrix &sequence, Matrix &batch);
+
+  /* Copy the index from another seq2batch. */
+  void shareIndexWith(const SequenceToBatch &seq2batch) {
+    CHECK(useGpu_ == seq2batch.useGpu_);
+    batchStartPositions_ = seq2batch.batchStartPositions_;
+    seq2BatchIdx_ = seq2batch.seq2BatchIdx_;
+    cpuSeq2BatchIdx_ = seq2batch.cpuSeq2BatchIdx_;
+    numBatch_ = seq2batch.numBatch_;
+  }
+
+ protected:
+  void sequence2BatchCopy(Matrix &batch,
+                          Matrix &sequence,
+                          IVector &seq2BatchIdx,
+                          bool seq2batch);
+  void sequence2BatchAdd(Matrix &batch,
+                         Matrix &sequence,
+                         IVector &seq2BatchIdx,
+                         bool seq2batch);
+
+  IVectorPtr batchStartPositions_;
+  IVectorPtr seq2BatchIdx_;
+  IVectorPtr cpuSeq2BatchIdx_;
+  IVectorPtr cpuSeqIdx_;
+  IVectorPtr cpuSeqEndIdxInBatch_;
+  IVectorPtr seqIdx_;
+  IVectorPtr seqEndIdxInBatch_;
+  size_t numBatch_;
+  bool useGpu_;
+  MatrixPtr batchValue_;
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/SliceProjection.cpp b/paddle/legacy/gserver/layers/SliceProjection.cpp
similarity index 100%
rename from paddle/gserver/layers/SliceProjection.cpp
rename to paddle/legacy/gserver/layers/SliceProjection.cpp
diff --git a/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp b/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..beb288e4ad8a058458d0b3488e7b95ca53d65cb1
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for applying a slope and an intercept to the input
+ *        element-wise. This layer is used in NEURAL TURING MACHINE.
+ * @note There is no activation and no weight in this layer.
+ *
+ * \f[
+ *    y = ax + b
+ * \f]
+ *
+ * Here, a is the slope and b is the intercept, which are provided as
+ * attributes of the layer.
+ *
+ * The config file api is slope_intercept_layer.
+ */
+
+class SlopeInterceptLayer : public Layer {
+ public:
+  explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(slope_intercept, SlopeInterceptLayer);
+
+bool SlopeInterceptLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+
+  return true;
+}
+
+void SlopeInterceptLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+
+  /* malloc memory for the output_ if necessary */
+  size_t batchSize = inV->getHeight();
+  size_t size = getSize();
+
+  CHECK_EQ(size, inV->getWidth());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, size);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwSlopeInterceptTimer", getName().c_str());
+    outV->mulScalar(*inV, config_.slope());
+    outV->add(config_.intercept());
+  }
+}
+
+void SlopeInterceptLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outG = getOutputGrad();
+
+  if (inG) {
+    REGISTER_TIMER_INFO("BwSlopeInterceptTimer", getName().c_str());
+    inG->add(*outG, config_.slope());
+  }
+}
+
+} // namespace paddle
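[Editor's note: a tiny standalone illustration of the forward and backward rules implemented above, y = a*x + b and dL/dx = a * dL/dy; it is not part of this patch and the values are made up.]

#include <cstdio>
#include <vector>

int main() {
  const float slope = 2.0f, intercept = 0.1f;
  std::vector<float> x = {1.f, -3.f}, dy = {0.5f, 1.f};
  for (size_t i = 0; i < x.size(); ++i) {
    float y = slope * x[i] + intercept;  // forward, as mulScalar + add
    float dx = slope * dy[i];            // backward, as inG->add(outG, slope)
    std::printf("y=%.2f dx=%.2f\n", y, dx);
  }
  return 0;
}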
diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/SpatialPyramidPoolLayer.cpp
rename to paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp
diff --git a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cdfba33b3db03e5a9d6b497675407287e936628
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "PoolProjection.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+/**
+ * @brief A layer for spatial pyramid pooling on the input image, taking
+ *        the max, average, etc. within regions, so that the result vectors
+ *        of different-sized images are of the same size.
+ *
+ * The config file api is spp_layer.
+ */
+
+class SpatialPyramidPoolLayer : public Layer {
+ protected:
+  size_t channels_;
+  size_t imgSizeW_;
+  size_t imgSizeH_;
+  size_t pyramidHeight_;
+  std::string poolType_;
+
+  std::vector<std::unique_ptr<PoolProjection>> poolProjections_;
+  std::vector<Argument> projOutput_;
+  std::vector<std::pair<size_t, size_t>> projCol_;
+
+ public:
+  explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  ProjectionConfig getConfig(size_t sizeX_,
+                             size_t sizeY_,
+                             size_t channels,
+                             size_t pyramidLevel_,
+                             std::string& poolType_);
+  size_t getSize();
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+} // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f648ec01c4a25ce598db9b6ca5583b24c51d57c
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp
@@ -0,0 +1,187 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+class SubNestedSequenceLayer : public Layer {
+ public:
+  explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ private:
+  /*
+   * This function generates the indices of rows in a batch according to the
+   * indices of the selected sub-sequences in each sequence.
+   *
+   * Examples:
+   * selectedIndices:
+   *   [
+   *     [0, 1, -1],
+   *     [0, 1, 2],
+   *     [0, -1, -1],
+   *     [0, 2, 3],
+   *   ]
+   * inputSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   *
+   * the output is saved to the private member rowIndice_:
+   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
+   */
+
+  void calSelectedRows(const MatrixPtr selectedIndices,
+                       const std::vector<std::vector<int>>& inputSeqInfo);
+
+  /*
+   * TODO(caoying)
+   * In PaddlePaddle, currently all matrices are real number types, but the
+   * second input, which holds selected indices of the given sequence used to
+   * trim the nested sequence, is actually filled with values of int type.
+   * Storing int-type information in real-number matrices is very dangerous,
+   * since the real numbers will be converted to int types. If a user fills
+   * this matrix himself, invalid data may occur.
+   *
+   * If the second input of this layer is on GPU memory, copy it to CPU memory.
+   */
+  MatrixPtr selIdsCpu_;
+
+  /*
+   * reorganize sequenceStartPositions and subSequenceStartPositions
+   * into a 2d vector to facilitate the sequence selection process.
+   */
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+
+  /* store the final selected row indices in a batch */
+  IVectorPtr rowIndice_;
+  /* rowIndice_ and selectedRows_ actually share the same memory. */
+  std::vector<int> selectedRows_;
+};
+
+REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer);
+
+bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(2U, inputLayers_.size());
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SubNestedSequenceLayer::calSelectedRows(
+    const MatrixPtr selectedIndices,
+    const std::vector<std::vector<int>>& inputSeqInfo) {
+  selectedRows_.clear();
+
+  std::vector<int> outSeqStartInfo(1, 0);
+  std::vector<int> outSubSeqStartInfo(1, 0);
+
+  size_t seqNum = selectedIndices->getHeight();
+  size_t beamSize = selectedIndices->getWidth();
+  for (size_t i = 0; i < seqNum; ++i) {
+    for (size_t j = 0; j < beamSize; ++j) {
+      if (selectedIndices->getElement(i, j) == -1.) break;
+      size_t selSubSeqIdx = selectedIndices->getElement(i, j);
+      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);
+
+      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
+                         inputSeqInfoVec_[i][selSubSeqIdx];
+      for (size_t k = 0; k < subSeqLen; ++k)
+        selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k);
+      outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen);
+    }
+    outSeqStartInfo.push_back(outSubSeqStartInfo.back());
+  }
+
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartInfo.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartInfo.data(), outSeqStartInfo.size(), false);
+
+  ICpuGpuVector::resizeOrCreate(
+      output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false);
+  output_.subSequenceStartPositions->copyFrom(
+      outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false);
+}
+
+void SubNestedSequenceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
+                              << "must be a nested sequence.";
+  const MatrixPtr selectedIndices = getInputValue(1);
+  CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight());
+
+  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
+    /*
+     * Currently, the second input for this layer is generated by
+     * kmax_sequence_score_layer whose output is always stored on CPU,
+     * or a data_layer which can be on GPU.
+     *
+     * If the second input is on GPU, copy it to CPU memory, because this
+     * input always uses very little memory, and the operations related to
+     * it are all logic control, not computation.
+     */
+    Matrix::resizeOrCreate(selIdsCpu_,
+                           selectedIndices->getHeight(),
+                           selectedIndices->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    selIdsCpu_->copyFrom(*selectedIndices);
+  } else {
+    selIdsCpu_ = selectedIndices;
+  }
+
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  calSelectedRows(selIdsCpu_, inputSeqInfoVec_);
+
+  resetOutput(selectedRows_.size(), getSize());
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+
+void SubNestedSequenceLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inputSeqGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+
+  if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_);
+}
+
+} // namespace paddle
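[Editor's note: a standalone sketch of the row expansion calSelectedRows performs, using the second sequence from the example in the class comment above; illustration only, not part of this patch.]

#include <cstdio>
#include <vector>

// Sub-sequence boundaries [4,5,7,10,15]; selecting sub-sequences {0, 1, 2}
// expands to the batch row indices 4..9.
int main() {
  std::vector<int> seqInfo = {4, 5, 7, 10, 15};  // sub-sequence boundaries
  std::vector<int> selected = {0, 1, 2};         // chosen sub-sequences
  std::vector<int> rows;
  for (int s : selected)
    for (int r = seqInfo[s]; r < seqInfo[s + 1]; ++r) rows.push_back(r);
  for (int r : rows) std::printf("%d ", r);  // prints: 4 5 6 7 8 9
  std::printf("\n");
  return 0;
}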
diff --git a/paddle/legacy/gserver/layers/SubSequenceLayer.cpp b/paddle/legacy/gserver/layers/SubSequenceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6b27550048c7c8930cf4a8b5e96fd06292f64070
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SubSequenceLayer.cpp
@@ -0,0 +1,226 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for taking a subsequence according to the given offset and size.
+ * Input: original sequence, offset, size
+ * Output: subsequence
+ */
+
+class SubSequenceLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> biases_;
+  MatrixPtr tmpSrc_;
+  MatrixPtr tmpDest_;
+
+ public:
+  explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(subseq, SubSequenceLayer);
+
+bool SubSequenceLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  // the subsequence layer should have exactly 3 inputs
+  CHECK_EQ(3U, inputLayers_.size());
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ =
+        std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+
+  tmpSrc_ =
+      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
+  tmpDest_ =
+      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SubSequenceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  size_t dim = getSize();
+
+  const Argument& input = getInput(0);
+  size_t numSequences1 = input.getNumSequences();
+  auto startPositions1 = input.sequenceStartPositions->getVector(false);
+
+  const Argument& offsetSeq = getInput(1);
+  size_t numSequences2 = offsetSeq.getNumSequences();
+  auto startPositions2 = offsetSeq.sequenceStartPositions->getVector(false);
+
+  const Argument& sizeSeq = getInput(2);
+  size_t numSequences3 = sizeSeq.getNumSequences();
+  auto startPositions3 = sizeSeq.sequenceStartPositions->getVector(false);
+
+  CHECK_EQ(dim, input.value->getWidth());
+
+  CHECK_EQ(startPositions1->getData()[numSequences1], input.getBatchSize());
+  CHECK_EQ(numSequences1, startPositions1->getSize() - 1);
+
+  CHECK_EQ(startPositions2->getData()[numSequences2], offsetSeq.getBatchSize());
+  CHECK_EQ(numSequences2, startPositions2->getSize() - 1);
+
+  CHECK_EQ(startPositions3->getData()[numSequences3], sizeSeq.getBatchSize());
+  CHECK_EQ(numSequences3, startPositions3->getSize() - 1);
+
+  CHECK_EQ(numSequences1, numSequences2);
+  CHECK_EQ(numSequences2, numSequences3);
+
+  MatrixPtr inputValue = input.value;
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
+
+  CHECK_EQ(offsetValue->getSize(), numSequences1);
+  CHECK_EQ(sizeValue->getSize(), numSequences1);
+
+  int* offsets = offsetValue->getData();
+  int* sizes = sizeValue->getData();
+
+  // get the total height of the output
+  size_t height = 0;
+  for (size_t seqId = 0; seqId < numSequences1; seqId++) {
+    height += sizes[seqId];
+  }
+
+  // reset output
+  resetOutput(height, dim);
+
+  MatrixPtr outputValue = getOutputValue();
+
+  const int* starts1 = startPositions1->getData();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SubSequenceLayerForward", getName().c_str());
+
+    size_t offsetIn = 0;
+    size_t offsetOut = 0;
+    size_t size = 0;
+    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
+      offsetIn = starts1[seqId] + offsets[seqId];
+      size = sizes[seqId];
+
+      outputValue->subMatrix(offsetOut, size, tmpDest_)
+          ->assign(*(inputValue->subMatrix(offsetIn, size, tmpSrc_)));
+
+      offsetOut += size;
+    }
+
+    // modify the sequenceStartPositions
+    ICpuGpuVector::resizeOrCreate(
+        output_.sequenceStartPositions, numSequences1 + 1, false);
+
+    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
+    int offset = 0;
+    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
+      tgtBuf[seqId] = offset;
+      offset += sizes[seqId];
+    }
+    tgtBuf[numSequences1] = offset;
+  }
+
+  if (biases_.get() != NULL) {
+    MatrixPtr outV = getOutputValue();
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation */
+  forwardActivation();
+}
+
+void SubSequenceLayer::backward(const UpdateCallback& callback) {
+  /* activation */
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    // increase the number of gradient updates
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  MatrixPtr inputGrad1 = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+  auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false);
+  size_t numSequences1 = startPositions1->getSize() - 1;
+  const int* starts1 = startPositions1->getData();
+
+  const Argument& offsetSeq = getInput(1);
+  const Argument& sizeSeq = getInput(2);
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
+
+  int* offsets = offsetValue->getData();
+  int* sizes = sizeValue->getData();
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SubSequenceLayerBackward", getName().c_str());
+
+    int offsetIn = 0;
+    int offsetOut = 0;
+    int size = 0;
+    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
+      offsetIn = starts1[seqId] + offsets[seqId];
+      size = sizes[seqId];
+
+      inputGrad1->subMatrix(offsetIn, size, tmpDest_)
+          ->add(*(outputGrad->subMatrix(offsetOut, size, tmpSrc_)));
+      offsetOut += size;
+    }
+  }
+}
+
+} // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp b/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4cd173a8c79204946fdeb4eb107cd0d9234f675a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for sum-to-one normalization,
+ * which is used in NEURAL TURING MACHINE.
+ * \f[
+ *   out[i] = \frac {in[i]} {\sum_{k=1}^N in[k]}
+ * \f]
+ * where \f$in\f$ is a (batchSize x dataDim) input vector,
+ * and \f$out\f$ is a (batchSize x dataDim) output vector.
+ *
+ * The config file api is sum_to_one_norm_layer.
+ */
+
+class SumToOneNormLayer : public Layer {
+ protected:
+  /// reciprocalRowSum_ = \f$1 / \sum_{k=1}^N in[k]\f$
+  MatrixPtr reciprocalRowSum_;
+  /// dotSum = output_.grad \f$.*\f$ output_.value
+  MatrixPtr dotSum_;
+
+ public:
+  explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer);
+
+bool SumToOneNormLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+
+  return true;
+}
+
+void SumToOneNormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+
+  /* malloc memory for the output_ if necessary */
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = getSize();
+
+  CHECK_EQ(dataDim, inV->getWidth());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    resetOutput(batchSize, dataDim);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwSumToOneNormTimer", getName().c_str());
+
+    Matrix::resizeOrCreate(reciprocalRowSum_, batchSize, 1, false, useGpu_);
+    inV->rowSum(*reciprocalRowSum_);
+
+    // todo: matrix checks
+    CHECK_GT(reciprocalRowSum_->getMin(), 0.0);
+
+    reciprocalRowSum_->scalarDiv(*reciprocalRowSum_, 1.0);
+
+    // outV = inV * reciprocalRowSum
+    outV->rowScale(0, *inV, *reciprocalRowSum_);
+  }
+}
+
+void SumToOneNormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+
+  size_t batchSize = inV->getHeight();
+
+  if (inG) {
+    REGISTER_TIMER_INFO("BwSumToOneTimer", getName().c_str());
+
+    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
+
+    // dotSum = outG .* outV
+    dotSum_->zeroMem();
+    dotSum_->rowDotMul(0, *outG, *outV);
+
+    // inG += -1 * (dotSum / rowSum)
+    dotSum_->dotMul(*dotSum_, *reciprocalRowSum_);
+    inG->rowAdd(0, *inG, *dotSum_, -1.0);
+    // inG += outG * (1/rowSum)
+    inG->addRowScale(0, *outG, *reciprocalRowSum_);
+  }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/SwitchOrderLayer.cpp b/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/SwitchOrderLayer.cpp
rename to paddle/legacy/gserver/layers/SwitchOrderLayer.cpp
diff --git a/paddle/gserver/layers/SwitchOrderLayer.h b/paddle/legacy/gserver/layers/SwitchOrderLayer.h
similarity index 100%
rename from paddle/gserver/layers/SwitchOrderLayer.h
rename to paddle/legacy/gserver/layers/SwitchOrderLayer.h
diff --git a/paddle/gserver/layers/TableProjection.cpp b/paddle/legacy/gserver/layers/TableProjection.cpp
similarity index 100%
rename from paddle/gserver/layers/TableProjection.cpp
rename to paddle/legacy/gserver/layers/TableProjection.cpp
diff --git a/paddle/gserver/layers/TableProjection.h b/paddle/legacy/gserver/layers/TableProjection.h
similarity index 100%
rename from paddle/gserver/layers/TableProjection.h
rename to paddle/legacy/gserver/layers/TableProjection.h
diff --git a/paddle/gserver/layers/TensorLayer.cpp b/paddle/legacy/gserver/layers/TensorLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/TensorLayer.cpp
rename to paddle/legacy/gserver/layers/TensorLayer.cpp
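[Editor's note: the backward pass above implements the quotient-rule gradient of y_i = x_i / S with S = sum_k x_k, namely dL/dx_i = (g_i - sum_j g_j * y_j) / S where g = dL/dy. A small standalone check of that formula against a central finite difference, for illustration only:]

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> x = {1.0, 2.0, 3.0}, g = {0.3, -0.1, 0.2};
  double S = x[0] + x[1] + x[2];
  double gy = 0;  // sum_j g_j * y_j
  for (size_t j = 0; j < x.size(); ++j) gy += g[j] * x[j] / S;
  double analytic = (g[0] - gy) / S;  // gradient w.r.t. x[0]

  // test loss L = sum_j g_j * y_j(x); perturb x[0] by +/- eps
  const double eps = 1e-6;
  double lPlus = 0, lMinus = 0;
  for (size_t j = 0; j < x.size(); ++j) {
    lPlus += g[j] * (x[j] + (j == 0 ? eps : 0)) / (S + eps);
    lMinus += g[j] * (x[j] - (j == 0 ? eps : 0)) / (S - eps);
  }
  double numeric = (lPlus - lMinus) / (2 * eps);
  std::printf("analytic=%.8f numeric=%.8f\n", analytic, numeric);
  return std::fabs(analytic - numeric) < 1e-6 ? 0 : 1;
}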
diff --git a/paddle/legacy/gserver/layers/TensorLayer.h b/paddle/legacy/gserver/layers/TensorLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c30f7c8899e23ad21f39a574d598dcefa32c11e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/TensorLayer.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * @brief TensorLayer takes two input vectors.
+ * \f[
+ *   y_{i} = x_{1} * W_{i} * x_{2}^{\rm T}, i = 0, 1, ..., K - 1
+ * \f]
+ *
+ * - \f$x_{1}\f$: the first input, size is M.
+ * - \f$x_{2}\f$: the second input, size is N.
+ * - y: output, size is K.
+ * - \f$y_{i}\f$: the i-th element of y.
+ * - \f$W_{i}\f$: the i-th learned weight, dimensions: [M, N].
+ * - \f$x_{2}^{\rm T}\f$: the transpose of \f$x_{2}\f$.
+ *
+ * The config file api is tensor_layer.
+ */
+
+class TensorLayer : public Layer {
+ protected:
+  WeightList weights_;
+  std::unique_ptr<Weight> biases_;
+
+ public:
+  explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  Weight& getWeight(int idx) { return *weights_[idx]; }
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+} // namespace paddle
diff --git a/paddle/gserver/layers/TransLayer.cpp b/paddle/legacy/gserver/layers/TransLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/TransLayer.cpp
rename to paddle/legacy/gserver/layers/TransLayer.cpp
diff --git a/paddle/legacy/gserver/layers/TransLayer.h b/paddle/legacy/gserver/layers/TransLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a6b13933f83f30a07ed63d722dbb612c64edae7
--- /dev/null
+++ b/paddle/legacy/gserver/layers/TransLayer.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A layer for transposing a minibatch matrix.
+ * \f[
+ *   y = x^\mathrm{T}
+ * \f]
+ * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
+ *
+ * The config file api is trans_layer.
+ */
+class TransLayer : public Layer {
+ public:
+  explicit TransLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+} // namespace paddle
diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp
similarity index 100%
rename from paddle/gserver/layers/TransposedFullMatrixProjection.cpp
rename to paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp
diff --git a/paddle/gserver/layers/UpsampleLayer.cpp b/paddle/legacy/gserver/layers/UpsampleLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/UpsampleLayer.cpp
rename to paddle/legacy/gserver/layers/UpsampleLayer.cpp
diff --git a/paddle/legacy/gserver/layers/UpsampleLayer.h b/paddle/legacy/gserver/layers/UpsampleLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea12a711a8a8a630ccf800812e71b75bab73550d
--- /dev/null
+++ b/paddle/legacy/gserver/layers/UpsampleLayer.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * This layer transposes the pooling process. It takes two inputs:
+ * the first is the input data, and the second is the mask data from
+ * the max-pool-with-mask layer.
+ *
+ */
+
+class UpsampleLayer : public Layer {
+ public:
+  explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {}
+  ~UpsampleLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+  size_t getOutputSize();
+
+ protected:
+  size_t scale_, scaleY_;
+  size_t upsampleSize_, upsampleSizeY_;
+  size_t padOutX_, padOutY_;
+  size_t imgSize_, imgSizeY_;
+  size_t channels_;
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/ValidationLayer.cpp b/paddle/legacy/gserver/layers/ValidationLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/ValidationLayer.cpp
rename to paddle/legacy/gserver/layers/ValidationLayer.cpp
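[Editor's note: a standalone sketch of max-unpooling in the style the header above describes — the pooled value is scattered back to the flat position recorded in the mask, and other output elements stay zero. This is an illustration of the presumed semantics only, not code from this patch; the exact handling of scale and padding lives in UpsampleLayer.cpp.]

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> pooled = {9.f, 7.f};
  std::vector<int> mask = {2, 5};        // flat argmax positions from pooling
  std::vector<float> upsampled(8, 0.f);  // upsampled output, zero-initialized
  for (size_t i = 0; i < pooled.size(); ++i) upsampled[mask[i]] = pooled[i];
  for (float v : upsampled) std::printf("%.0f ", v);  // 0 0 9 0 0 7 0 0
  std::printf("\n");
  return 0;
}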
diff --git a/paddle/legacy/gserver/layers/ValidationLayer.h b/paddle/legacy/gserver/layers/ValidationLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbc94e8ef570e2eec1d3737aca97bbf91c1392b2
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ValidationLayer.h
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <memory>
+
+#include "Layer.h"
+#include "paddle/legacy/gserver/evaluators/Evaluator.h"
+
+DECLARE_int32(trainer_id);
+
+namespace paddle {
+
+class ValidationLayer : public Layer {
+ public:
+  explicit ValidationLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  LayerPtr getOutputLayer() { return inputLayers_[0]; }
+
+  LayerPtr getLabelLayer() { return inputLayers_[1]; }
+
+  LayerPtr getInfoLayer() {
+    assert(inputLayers_.size() > 2);
+    return inputLayers_[2];
+  }
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+  virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0;
+
+  void onPassEnd() override = 0;
+};
+
+/*
+ * AucValidation
+ */
+class AucValidation : public ValidationLayer {
+ public:
+  explicit AucValidation(const LayerConfig& config)
+      : ValidationLayer(config),
+        cpuOutput_(nullptr),
+        cpuLabel_(nullptr),
+        cpuWeight_(nullptr) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
+
+  void onPassEnd() override;
+
+  struct PredictionResult {
+    PredictionResult(real __out, int __label) : out(__out), label(__label) {}
+    real out;
+    int label;
+  };
+  std::vector<PredictionResult> predictArray_;
+
+ private:
+  bool passBegin_;
+  std::unique_ptr<Evaluator> evaluator_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
+  MatrixPtr cpuWeight_;
+};
+
+/*
+ * positive-negative pair rate Validation
+ */
+class PnpairValidation : public ValidationLayer {
+ public:
+  explicit PnpairValidation(const LayerConfig& config)
+      : ValidationLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
+
+  void onPassEnd() override;
+
+ private:
+  bool passBegin_;
+  std::unique_ptr<Evaluator> evaluator_;
+};
+
+typedef std::shared_ptr<ValidationLayer> ValidationLayerPtr;
+} // namespace paddle
diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/legacy/gserver/layers/WarpCTCLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/WarpCTCLayer.cpp
rename to paddle/legacy/gserver/layers/WarpCTCLayer.cpp
diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/legacy/gserver/layers/WarpCTCLayer.h
similarity index 100%
rename from paddle/gserver/layers/WarpCTCLayer.h
rename to paddle/legacy/gserver/layers/WarpCTCLayer.h
diff --git a/paddle/gserver/tests/.gitignore b/paddle/legacy/gserver/tests/.gitignore
similarity index 100%
rename from paddle/gserver/tests/.gitignore
rename to paddle/legacy/gserver/tests/.gitignore
diff --git a/paddle/legacy/gserver/tests/CMakeLists.txt b/paddle/legacy/gserver/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..93ddf5aa233017d4f5139a8add6c69ef3a4682b4
--- /dev/null
+++ b/paddle/legacy/gserver/tests/CMakeLists.txt
@@ -0,0 +1,103 @@
+# gserver package unittests
+add_simple_unittest(test_LinearChainCRF)
+add_simple_unittest(test_RecurrentLayer)
+
+if(NOT MOBILE_INFERENCE)
+  add_simple_unittest(test_MultinomialSampler)
+endif()
+
+function(gserver_test TARGET)
+  add_unittest_without_exec(${TARGET}
+      ${TARGET}.cpp
+      LayerGradUtil.cpp)
+  add_test(NAME ${TARGET}
+      COMMAND ${TARGET})
+endfunction()
+
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/concat_dotmul_a.conf
+        COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_gserver_conf ALL DEPENDS concat_dotmul_a.conf)
+
+gserver_test(test_LayerGrad)
+gserver_test(test_CRFLayerGrad)
+gserver_test(test_CrossEntropyOverBeamGrad)
+gserver_test(test_SeqSliceLayerGrad)
+gserver_test(test_ActivationGrad)
+gserver_test(test_ConvTrans)
+gserver_test(test_PriorBox)
+gserver_test(test_DetectionOutput)
+gserver_test(test_ConvUnify)
+gserver_test(test_BatchNorm)
+gserver_test(test_KmaxSeqScore)
+gserver_test(test_Expand)
+gserver_test(test_MaxPoolingWithMaskOutput)
+gserver_test(test_Upsample)
+
+set(PYTHON_PATH
+    ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+    ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/legacy/gserver/tests)
+function(gserver_test_with_python TARGET)
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+  add_test(NAME ${TARGET}
+      COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
+endfunction()
+
+gserver_test_with_python(test_PyDataProvider2)
+if(WITH_PYTHON)
+  gserver_test_with_python(test_PyDataProvider)
+endif()
+if(NOT MOBILE_INFERENCE)
+  gserver_test_with_python(test_CompareTwoNets)
+  # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
+  gserver_test_with_python(test_RecurrentGradientMachine)
+endif()
+
+########## test_MKLDNN layers and activations ##########
+if(WITH_MKLDNN)
+  add_unittest_without_exec(test_MKLDNN
+    test_MKLDNN.cpp
+    MKLDNNTester.cpp
+    LayerGradUtil.cpp)
+  add_test(NAME test_MKLDNN
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
+    WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
+endif()
+
+############### test_WarpCTCLayer #######################
+if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
+  add_unittest_without_exec(test_WarpCTCLayer
+    test_WarpCTCLayer.cpp)
+  add_test(NAME test_WarpCTCLayer
+    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
+    WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
+endif()
+
+if(NOT MOBILE_INFERENCE)
+  ################## test_Evaluator #############
+  add_unittest(test_Evaluator
+    test_Evaluator.cpp)
+
+  ########### test_NetworkCompare ###############
+  add_unittest_without_exec(test_NetworkCompare
+    test_NetworkCompare.cpp)
+  if(WITH_GPU)
+    set(use_gpu true)
+  else()
+    set(use_gpu false)
+  endif()
+  add_test(NAME test_NetworkCompare
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
+    WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
+
+  ############ test_CompareSparse ################
+  add_unittest_without_exec(test_CompareSparse
+    test_CompareSparse.cpp)
+  if(NOT ON_TRAVIS)
+    add_test(NAME test_CompareSparse
+      COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 6
+          ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
+  endif()
+endif()
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/legacy/gserver/tests/LayerGradUtil.cpp
similarity index 100%
rename from paddle/gserver/tests/LayerGradUtil.cpp
rename to paddle/legacy/gserver/tests/LayerGradUtil.cpp
diff --git a/paddle/legacy/gserver/tests/LayerGradUtil.h b/paddle/legacy/gserver/tests/LayerGradUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..941989a1da49d215b9ed4af72e732d6a62fd225d
--- /dev/null
+++ b/paddle/legacy/gserver/tests/LayerGradUtil.h
@@ -0,0 +1,329 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+
+#include "paddle/testing/TestUtil.h"
+using namespace std;  // NOLINT
+
+namespace paddle {
+enum InputType {
+  INPUT_DATA,                  // dense vector
+  INPUT_LABEL,                 // id
+  INPUT_DATA_TARGET,           // dense vector, but no gradient
+  INPUT_SEQUENCE_DATA,
+  INPUT_HASSUB_SEQUENCE_DATA,  // sequence has sub-sequence
+  INPUT_SEQUENCE_MDIM_DATA,
+  INPUT_SEQUENCE_LABEL,
+  INPUT_SPARSE_NON_VALUE_DATA,
+  INPUT_SPARSE_FLOAT_VALUE_DATA,
+  INPUT_DENSE_DIM_DATA,    // using sequence length to init dense data
+  INPUT_SELF_DEFINE_DATA,  // support customized input values
+};
+
+struct ParaSparse {
+  bool sparse;
+  string format;
+  // if equalNnzPerSample is set true,
+  // every row of the sparse matrix in CSR format has the same
+  // number of nnz values. Currently, this flag is only used for the
+  // selective_fc layer
+  bool equalNnzPerSample;
+  ParaSparse(const string& formatIn = "") {  // NOLINT
+    if (formatIn == "") {
+      sparse = false;
+    } else {
+      sparse = true;
+    }
+    equalNnzPerSample = false;
+  }
+  ParaSparse(const string& formatIn, bool equalNnz) {
+    format = formatIn;
+    sparse = true;
+    equalNnzPerSample = equalNnz;
+  }
+};
+
+struct InputDef {
+  InputType inputType;
+  string name;
+  size_t dim;
+  size_t paraSize;
+  ParaSparse sparse;
+  bool isStatic;
+  std::vector<real> labelInitValue;
+  std::vector<int> labelSeqStartPositions;
+  std::vector<int> labelSubSeqStartPositions;
+  std::vector<int> ids;
+  MatrixPtr selfDefinedData;
+
+  InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
+    inputType = type;
+    name = nameIn;
+    dim = dimIn;
+    paraSize = sizeIn;
+    sparse = {""};
+    isStatic = false;
+  }
+
+  InputDef(InputType type,
+           string nameIn,
+           MatrixPtr selfDefinedData,
+           std::vector<int> selfDefinedSeqStartPos = {},
+           std::vector<int> selfDefinedSubSeqStartPos = {})
+      : labelSeqStartPositions(selfDefinedSeqStartPos),
+        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
+        selfDefinedData(selfDefinedData) {
+    inputType = type;
+    name = nameIn;
+    dim = 0;
+    sparse = {""};
+    paraSize = 0;
+    isStatic = false;
+  }
+
+  InputDef(InputType type,
+           string nameIn,
+           const std::vector<int>& ids,
+           const std::vector<int>& selfDefinedSeqStartPos = {},
+           const std::vector<int>& selfDefinedSubSeqStartPos = {})
+      : labelSeqStartPositions(selfDefinedSeqStartPos),
+        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
+        ids(ids) {
+    selfDefinedData = nullptr;
+    inputType = type;
+    name = nameIn;
+    dim = 0;
+    sparse = {""};
+    paraSize = 0;
+    isStatic = false;
+  }
+
+  InputDef(InputType type,
+           string nameIn,
+           size_t dimIn,
+           size_t sizeIn,
+           const std::vector<real>& labelInitValue,
+           const std::vector<int>& labelSeqStartPositions)
+      : labelInitValue(labelInitValue),
+        labelSeqStartPositions(labelSeqStartPositions) {
+    inputType = type;
+    name = nameIn;
+    dim = dimIn;
+    paraSize = sizeIn;
+    sparse = {""};
+    isStatic = false;
+  }
+
+  InputDef(InputType type,
+           string nameIn,
+           size_t dimIn,
+           size_t sizeIn,
+           ParaSparse sparseIn) {
+    inputType = type;
+    name = nameIn;
+    dim = dimIn;
+    paraSize = sizeIn;
+    sparse = sparseIn;
+  }
+};
+
+struct TestConfig {
+  LayerConfig layerConfig;
+  std::vector<InputDef> inputDefs;
+  size_t biasSize;
+  real paramInitialMean;
+  real paramInitialStd;
+  bool testAccumulate;
+  bool testState;
+  bool staticBias;
+  bool testBatchState;
+  TestConfig()
+      : biasSize(0),
+        paramInitialMean(0.0),
+        paramInitialStd(1.0),
+        testAccumulate(true),
+        testState(false),
+        staticBias(false),
+        testBatchState(false) {}
+};
+
+real getCostSum(ParameterPtr& parameter,
+                CpuVector& cpuPara,
+                LayerPtr& testLayer,
+                MatrixPtr weights = nullptr);
+
+real getDiffAndPrint(real newCost1,
+                     real newCost2,
+                     real callbackCount,
+                     char fill,
+                     string testLayerName,
+                     string name,
+                     real step,
+                     real delta);
+
+/**
+ * @brief verify that sequentially running forward() one timestep at a time
+ *        gives the same result as running forward() with one whole sequence
+ *
+ * @param testLayer[in/out] testLayer
+ * @param dataLayers[in/out] dataLayers
+ * @param datas[in/out] data of dataLayers
+ */
+void testState(LayerPtr testLayer,
+               vector<DataLayerPtr>& dataLayers,
+               vector<Argument>& datas);
+
+/**
+ * @brief verify that sequentially running forward() with short sequences one
+ *        at a time gives the same result as running forward() with long
+ *        sequences.
+ *
+ * @param testLayer[in/out] testLayer
+ * @param dataLayers[in/out] dataLayers
+ * @param datas[in/out] data of dataLayers
+ */
+void testBatchState(LayerPtr testLayer,
+                    vector<DataLayerPtr>& dataLayers,
+                    vector<Argument>& datas);
+
+/**
+ * @brief Generate a perturbation so that it is roughly aligned with the
+ *        gradient direction. This is to make sure that change along this
+ *        direction will make the cost increase (or decrease) in a meaningful
+ *        way, so that the finite difference can be used to approximate the
+ *        directional derivative well.
+ *
+ * @param oldGrad[in] input gradient
+ *        newGrad[out] output gradient
+ *        dim dimension of oldGrad/newGrad
+ *
+ * @return sum_i(oldGrad[i] * newGrad[i])
+ */
+double genPerturbation(const real* oldGrad, real* newGrad, size_t dim);
+
+void initWeight(MatrixPtr& weights);
+
+void initBatchState(LayerPtr dataLayer,
+                    LayerPtr testLayer,
+                    LayerStatePtr state,
+                    bool useGpu);
+
+/**
+ * @brief initialize the dataLayer by its inputType
+ *
+ * @param testConf[in] test config
+ *        dataLayers[out] dataLayers
+ *        datas[out] initialized data of dataLayers
+ *        layerMap[out] layerMap
+ */
+void initDataLayer(TestConfig testConf,
+                   std::vector<DataLayerPtr>* dataLayers,
+                   vector<Argument>* datas,
+                   LayerMap* layerMap,
+                   string testLayerName,
+                   size_t batchSize,
+                   bool trans,
+                   bool useGpu);
+
+/**
+ * @brief initialize the parameter of testLayer
+ *
+ * @param testConf[in/out] test config
+ *        layerMap[out] layerMap
+ *        parameters[out] parameters of testLayer
+ *        testLayer[out] testLayer
+ */
+void initTestLayer(TestConfig testConf,
+                   LayerMap* layerMap,
+                   std::vector<ParameterPtr>* parameters,
+                   LayerPtr* testLayer);
+
+/**
+ * @brief Test whether the layer's forward calculation is stable by adding
+ *        perturbation to its parameters
+ *
+ * @param testConf[in] test config
+ *        weights[in] weights of testLayer
+ *        state[in] state of testLayer
+ *        cost[in] input cost
+ *        callbackCount[in] number of done callbacks
+ *        maxDiff[in/out] max of all previous diffs
+ *        testLayer[in/out] testLayer
+ *        parameters[in/out] parameters of testLayer
+ */
+void testPerturbParameter(TestConfig testConf,
+                          const MatrixPtr weights,
+                          const LayerStatePtr state,
+                          real cost,
+                          real callbackCount,
+                          real* maxDiff,
+                          LayerPtr testLayer,
+                          std::vector<ParameterPtr>* parameters);
+
+/**
+ * @brief Test whether the layer's forward calculation is stable by adding
+ *        perturbation to its input layers
+ *
+ * @param testConf[in] test config
+ *        weights[in] weights of testLayer
+ *        state[in] state of testLayer
+ *        cost[in] input cost
+ *        callbackCount[in] number of done callbacks
+ *        maxDiff[in/out] max of all previous diffs
+ *        testLayer[in/out] testLayer
+ *        dataLayers[in/out] dataLayers
+ */
+void testPerturbInput(TestConfig testConf,
+                      const MatrixPtr weights,
+                      const LayerStatePtr state,
+                      real cost,
+                      real callbackCount,
+                      real* maxDiff,
+                      LayerPtr testLayer,
+                      std::vector<DataLayerPtr> dataLayers);
+
+void testLayerGradKernel(TestConfig testConf,
+                         string testLayerName,
+                         size_t batchSize,
+                         bool trans,
+                         bool useGpu,
+                         bool useWeight = false,
+                         float epsilon = 0.02);
+
+void testLayerGrad(TestConfig testConf,
+                   string testLayerName,
+                   size_t batchSize,
+                   bool trans,
+                   bool useGpu,
+                   bool useWeight = false,
+                   float epsilon = 0.02);
+
+void testProjectionGrad(ProjectionConfig conf,
+                        InputType inputType,
+                        size_t parameterSize,
+                        size_t batchSize,
+                        bool useGpu,
+                        bool testState = false,
+                        int biasSize = 0,
+                        bool sharedBias = false);
+
+void testOperatorGrad(TestConfig& config,
+                      OperatorConfig& operatorConf,
+                      size_t batchSize,
+                      bool useGpu,
+                      bool testState = false);
+
+} // namespace paddle
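[Editor's note: for orientation, this is roughly how the tests in this directory drive these helpers. The sketch follows the pattern of test_LayerGrad.cpp (registered in the CMakeLists above); the concrete layer type, sizes, and values are illustrative assumptions, and it needs the gtest and Paddle test scaffolding to compile.]

// Gradient check for a layer: describe it in a TestConfig, then let
// testLayerGrad perturb inputs/parameters and compare against finite
// differences on both CPU and GPU.
TEST(Layer, SlopeInterceptLayer) {
  TestConfig config;
  config.layerConfig.set_type("slope_intercept");
  config.layerConfig.set_size(10);
  config.layerConfig.set_slope(1.0);
  config.layerConfig.set_intercept(0.1);
  config.inputDefs.push_back({INPUT_DATA, "layer_0", /* dim */ 10, /* paraSize */ 0});
  config.layerConfig.add_inputs();
  for (auto useGpu : {false, true}) {
    testLayerGrad(config, "slope_intercept", /* batchSize */ 100,
                  /* trans */ false, useGpu);
  }
}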
diff --git a/paddle/legacy/gserver/tests/MKLDNNTester.cpp b/paddle/legacy/gserver/tests/MKLDNNTester.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bed58f94bb047e7d7bbc3c7746d2b484d3883861
--- /dev/null
+++ b/paddle/legacy/gserver/tests/MKLDNNTester.cpp
@@ -0,0 +1,580 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNTester.h"
+#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
+#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
+#include "paddle/trainer/Trainer.h"
+
+namespace paddle {
+
+// init the data layer and test layer of both dnn and reference
+void MKLDNNTester::reset(const TestConfig& dnn,
+                         const TestConfig& ref,
+                         size_t batchSize) {
+  const bool trans = false;
+  const bool useGpu = false;
+
+  // clear
+  configs_.clear();
+  layerNames_.clear();
+  dataLayers_.clear();
+  datas_.clear();
+  layerMaps_.clear();
+  parameters_.clear();
+  testLayers_.clear();
+
+  // resize
+  configs_.resize(NUM);
+  layerNames_.resize(NUM);
+  dataLayers_.resize(NUM);
+  datas_.resize(NUM);
+  layerMaps_.resize(NUM);
+  parameters_.resize(NUM);
+  testLayers_.resize(NUM);
+
+  // reset configs and layer names
+  configs_[DNN] = dnn;
+  configs_[REF] = ref;
+  layerNames_[DNN] = "mkldnn";     // the first is the mkldnn layer
+  layerNames_[REF] = "reference";  // the second is the reference layer
+
+  // reset others
+  for (size_t i = 0; i < NUM; ++i) {
+    configs_[i].layerConfig.set_name(layerNames_[i]);
+    initDataLayer(configs_[i],
+                  &(dataLayers_[i]),
+                  &(datas_[i]),
+                  &(layerMaps_[i]),
+                  layerNames_[i],
+                  batchSize,
+                  trans,
+                  useGpu);
+    initTestLayer(
+        configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i]));
+  }
+  refLayer_ = testLayers_[REF];
+  dnnLayer_ = testLayers_[DNN];
+  EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size());
+  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  setInputImgSize();
+
+  // for comparison with the Paddle reference results, we need to manually
+  // add a cpu device output for the test
+  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
+  if (dnnLayer) {
+    dnnLayer->addOutputArgument(CPU_DEVICE);
+  }
+}
+
+void MKLDNNTester::setInputImgSize() {
+  for (size_t n = 0; n < dataLayers_.size(); ++n) {
+    for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
+      // TODO(TJ): fix me when concat and elewise ready
+      dataLayers_[n][i]->getOutput().setFrameHeight(ih_);
+      dataLayers_[n][i]->getOutput().setFrameWidth(iw_);
+    }
+  }
+}
+
+// init random parameters of ref, and copy them to mkldnn
+void MKLDNNTester::randomWgtDatas() {
+  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  const bool isBN = refLayer_->getType() == "batch_norm";
+  for (size_t i = 0; i < parameters_[REF].size(); ++i) {
+    const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
+    parameters_[REF][i]->randomize();
+    if (isBN && i == 2) {
+      // this param is the moving average in batch norm,
+      // which must be larger than 0
+      real offset = fabs(refValue->getMin()) + 1.0;
+      refValue->add(offset);
+    }
+    dnnValue->copyFrom(*refValue);
+
+    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
+    printVector(dnnValue);
+  }
+}
+
+// randomize the bottom data of the ref layer and copy the same to mkldnn
+void MKLDNNTester::randomBotDatas() {
+  CHECK_EQ(dataLayers_.size(), NUM);
+  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
+    dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
+    dataLayers_[DNN][i]->getOutputValue()->copyFrom(
+        *(dataLayers_[REF][i]->getOutputValue()));
+    VLOG(MKLDNN_TESTS) << "Random Forward, InputValue " << i;
+    printMatrix(dataLayers_[REF][i]->getOutputValue());
+  }
+}
+
+void MKLDNNTester::randomTopDiffs() {
+  refLayer_->getOutputGrad()->randomizeUniform();
+  dnnLayer_->getOutput(CPU_DEVICE)
+      .grad->copyFrom(*(refLayer_->getOutputGrad()));
+  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
+  printMatrix(refLayer_->getOutputGrad());
+}
+
+void MKLDNNTester::checkForward() {
+  VLOG(MKLDNN_TESTS) << "Check Forward";
+  printTopDatas();
+  double delta =
+      compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
+  EXPECT_LE(fabs(delta), eps_);
+}
+
+void MKLDNNTester::checkBackwardData() {
+  VLOG(MKLDNN_TESTS) << "Check Backward Data";
+  const bool isBN = refLayer_->getType() == "batch_norm";
+  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
+    const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
+    const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
+    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
+    printMatrix(dnnDiff);
+    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
+    printMatrix(refDiff);
+
+    double delta = compareMatrix(refDiff, dnnDiff);
+    EXPECT_LE(fabs(delta), eps_);
+    if (isBN) {
+      // the other two inputs in batch norm are the moving mean and var,
+      // which do not have grads to compare
+      break;
+    }
+  }
+}
+
+void MKLDNNTester::checkBackwardWgts() {
+  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
+  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  vector<VectorPtr> dnnWgts;  // used to temporarily save the mkldnn weights
+  saveWgt(parameters_[DNN], dnnWgts);
+
+  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
+  if (dnnLayer) {
+    dnnLayer->convertWeightsToPaddle();
+  }
+  for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
+    const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
+    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value "
+                     << parameters_[DNN][i]->getName();
+    printVector(dnn);
+    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
+                     << parameters_[REF][i]->getName();
+    printVector(ref);
+
+    double delta = compareVector(ref, dnn);
+    EXPECT_LE(fabs(delta), eps_);
+  }
+
+  VLOG(MKLDNN_ALL) << "Restore dnn weights before compare";
+  restoreWgt(dnnWgts, parameters_[DNN]);
+}
+
+void MKLDNNTester::saveWgt(const vector<ParameterPtr>& from,
+                           vector<VectorPtr>& to) {
+  const bool useGpu = false;
+  to.resize(from.size());
+  for (size_t i = 0; i < to.size(); ++i) {
+    const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE);
+    to[i] = Vector::create(wgt->getSize(), useGpu);
+    to[i]->copyFrom(*wgt);
+  }
+}
+
+void MKLDNNTester::restoreWgt(const vector<VectorPtr>& from,
+                              vector<ParameterPtr>& to) {
+  CHECK_EQ(from.size(), to.size());
+  for (size_t i = 0; i < from.size(); ++i) {
+    const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE);
+    wgt->copyFrom(*from[i]);
+  }
+}
+
+// clear parameter gradients
+void MKLDNNTester::clearWgtDiffs(size_t id) {
+  CHECK_LE(id, parameters_.size());
+  for (size_t n = 0; n < parameters_.size(); ++n) {
+    if (id == n || id == parameters_.size()) {
+      for (size_t i = 0; i < parameters_[n].size(); ++i) {
+        const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT);
+        if (grad) {
+          grad->zeroMem();
+        }
+      }
+    }
+  }
+}
+void MKLDNNTester::clearBotDiffs(size_t id) {
+  CHECK_LE(id, dataLayers_.size());
+  for (size_t n = 0; n < dataLayers_.size(); ++n) {
+    if (id == n || id == dataLayers_.size()) {
+      // clear the input layers of this specific layer
+      for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
+        dataLayers_[n][i]->getOutputGrad()->zeroMem();
+      }
+    }
+  }
+}
+
+void MKLDNNTester::clearTopDatas(size_t id) {
+  CHECK_LE(id, testLayers_.size());
+  for (size_t i = 0; i < testLayers_.size(); ++i) {
+    if (id == i || id == testLayers_.size()) {
+      testLayers_[i]->getOutputValue()->zeroMem();
+    }
+  }
+}
+
+void MKLDNNTester::printTopDatas() {
+  if (!log_) {
+    return;
+  }
+
+  for (int n = 0; n < NUM; ++n) {
+    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
+                     << " Forward Result: OutputValue";
+    printMatrix(testLayers_[n]->getOutputValue());
+  }
+}
+
+void MKLDNNTester::printMatrix(const MatrixPtr& m) {
+  if (!log_) {
+    return;
+  }
+
+  std::ostringstream ostr;
+  m->print(ostr);
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
+}
+
+void MKLDNNTester::printVector(const VectorPtr& v) {
+  if (!log_) {
+    return;
+  }
+
+  std::ostringstream ostr;
+  v->print(ostr, v->getSize());
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
+}
+
+double MKLDNNTester::getDelta(const real* refer,
+                              const real* value,
+                              size_t len,
+                              const float failRate,
+                              const float thres) {
+  double delta = 0, sum = 0;
+  int failCnt = 0;
+  const double eps = 1e-5;
+  double maxRatio = 0;
+  for (size_t i = 0; i < len; ++i) {
+    double ref = fabs(refer[i]);
+    double val = fabs(value[i]);
+    double diff = fabs(refer[i] - value[i]);
+    delta += diff;
+    sum += ref;
+    if (ref < eps && val < eps) {  // both values are very small
+      continue;
+    }
+    double ratio = diff / ref;
+    if (ratio > thres) {
+      maxRatio = std::max(maxRatio, ratio);
+      failCnt++;
+    }
+  }
+  EXPECT_FALSE(std::isinf(sum));
+  EXPECT_FALSE(std::isnan(sum));
+  EXPECT_FALSE(std::isnan(delta));
+  VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len
+                   << ", delta: " << delta / sum << ", failCnt: " << failCnt;
+  double res = sum > eps ? delta / sum : eps;
+  return (failCnt / (float)len) > failRate ? maxRatio : res;
+}
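To make the two regimes of `getDelta` concrete, here is a small standalone re-statement of the arithmetic (a hypothetical helper with made-up values, double precision, and the gtest checks dropped). In the 4-element example, a single per-point ratio above `thres` already pushes the fail fraction past `failRate`, switching the result from the aggregate ratio `sum(|diff|)/sum(|ref|)` to the worst per-point ratio.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

// Simplified sketch of MKLDNNTester::getDelta to show the two return regimes.
double getDeltaSketch(const double* ref, const double* val, size_t len,
                      double failRate = 1e-3, double thres = 0.1) {
  const double eps = 1e-5;
  double delta = 0, sum = 0, maxRatio = 0;
  size_t failCnt = 0;
  for (size_t i = 0; i < len; ++i) {
    double r = std::fabs(ref[i]);
    double diff = std::fabs(ref[i] - val[i]);
    delta += diff;
    sum += r;
    if (r < eps && std::fabs(val[i]) < eps) continue;  // both tiny: skip ratio
    double ratio = diff / r;
    if (ratio > thres) {
      maxRatio = std::max(maxRatio, ratio);
      ++failCnt;
    }
  }
  double res = sum > eps ? delta / sum : eps;
  return (failCnt / (double)len) > failRate ? maxRatio : res;
}

int main() {
  double ref[] = {1.0, 2.0, 4.0, 8.0};
  double ok[]  = {1.0, 2.0, 4.0, 8.4};  // worst ratio 0.4/8 = 0.05 <= thres
  double bad[] = {1.0, 2.0, 4.0, 9.0};  // worst ratio 1.0/8 = 0.125 > thres
  printf("%f\n", getDeltaSketch(ref, ok, 4));   // 0.4/15 ~= 0.026667 (aggregate)
  printf("%f\n", getDeltaSketch(ref, bad, 4));  // 0.125 (max per-point ratio)
  return 0;
}
```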
+
+double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) {
+  CHECK_EQ(m1->getElementCnt(), m2->getElementCnt());
+  return getDelta(m1->getData(), m2->getData(), m1->getElementCnt());
+}
+
+double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) {
+  CHECK_EQ(v1->getSize(), v2->getSize());
+  return getDelta(v1->getData(), v2->getData(), v1->getSize());
+}
+
+void MKLDNNTester::runOnce() {
+  // test forward
+  randomBotDatas();
+  dnnLayer_->forward(passType_);
+  refLayer_->forward(passType_);
+  checkForward();
+
+  if (passType_ == PASS_TEST) {
+    return;
+  }
+
+  // test backward
+  // simple updater
+  UpdateCallback updateCallback = [](Parameter* para) {
+    auto& grad = para->getBuf(PARAMETER_GRADIENT);
+    auto& value = para->getBuf(PARAMETER_VALUE);
+    real lr = 1e-2;
+    value->add(*grad, lr);
+    grad->zeroMem();
+  };
+  randomTopDiffs();
+  dnnLayer_->backward(updateCallback);
+  refLayer_->backward(updateCallback);
+  checkBackwardData();
+  checkBackwardWgts();
+
+  // clear buffers
+  // the ref code will add to the diffs while the dnn code will write to them,
+  // and clearTopDatas(REF) should be covered by the ref layers
+  clearBotDiffs(REF);
+  clearWgtDiffs(REF);
+  // it is necessary to clear bottom diffs when only the activation is dnn type
+  if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) {
+    clearBotDiffs(DNN);
+  }
+}
+
+void MKLDNNTester::run(const TestConfig& dnn,
+                       const TestConfig& ref,
+                       size_t batchSize,
+                       size_t inputImgH,
+                       size_t inputImgW,
+                       PassType passType,
+                       bool printDetails,
+                       size_t iter,
+                       float epsilon) {
+  CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
+        dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
+      << "should be MKLDNN layer or MKLDNN activation";
+  if (dnn.layerConfig.type() == ref.layerConfig.type()) {
+    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
+                       << dnn.layerConfig.active_type() << " vs "
+                       << ref.layerConfig.active_type();
+  } else {
+    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
+                       << dnn.layerConfig.type() << " vs "
+                       << ref.layerConfig.type();
+  }
+
+  ih_ = inputImgH;
+  iw_ = inputImgW;
+  passType_ = passType;
+  log_ = printDetails;
+  iter_ = iter;
+  eps_ = epsilon;
+
+  // First, test mkldnn initialized from PARAM_FORMAT_ORIGINAL weights
+  reset(dnn, ref, batchSize);
+  randomWgtDatas();
+  clearWgtDiffs();
+  clearBotDiffs();
+  for (size_t i = 0; i < iter_; ++i) {
+    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
+    runOnce();
+  }
+
+  if (parameters_[DNN].empty()) {
+    // has no parameters
+    return;
+  }
+
+  // After running some iterations, the mkldnn weights have been stored in
+  // dnnLayer, and we can also get the mkldnn weight parameter header format.
+  // The weight parameter should always be at index 0 (and the bias at index 1).
+  // TODO(TJ): should also consider mean and var format when batchnorm ready
+  int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat();
+  int refWgtFmt = parameters_[REF][0]->getHeaderFormat();
+  if (dnnWgtFmt == refWgtFmt) {
+    // the weight formats are equal, so no more checks are needed
+    return;
+  }
+
+  // otherwise, save the weights and restart
+  vector<VectorPtr> dnnWgts, refWgts;
+  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  saveWgt(parameters_[DNN], dnnWgts);
+  saveWgt(parameters_[REF], refWgts);
+
+  // restart again with the dnn weight format
+  reset(dnn, ref, batchSize);
+  // TODO(TJ): should also consider mean and var format when batchnorm ready
+  parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt);
+
+  // restore wgt
+  restoreWgt(dnnWgts, parameters_[DNN]);
+  restoreWgt(refWgts, parameters_[REF]);
+  clearWgtDiffs();
+  clearBotDiffs();
+
+  for (size_t i = 0; i < iter_; ++i) {
+    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
+    runOnce();
+  }
+}
+
+void MKLDNNTester::initArgument(DataIn& data,
+                                const std::string& configPath,
+                                const size_t iter) {
+  TrainerConfigHelper config(configPath);
+  size_t batchSize = config.getOptConfig().batch_size();
+  data.inArgs.resize(iter);
+  data.outGrads.resize(iter);
+  data.paraValues.clear();
+  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      Argument arg;
+      arg.value = Matrix::create(batchSize, layerSize, false, false);
+      arg.grad = Matrix::create(batchSize, layerSize, false, false);
+      arg.value->randomizeUniform();
+      arg.value->add(-0.5);
+      arg.value->sigmoid(*arg.value);
+      arg.grad->zeroMem();
+      arg.ids = VectorT<int>::create(batchSize, false);
+      arg.ids->rand(layerSize);
+      generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
+      data.inArgs[i].push_back(arg);
+    }
+  }
+
+  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false);
+      grad->randomizeUniform();
+      data.outGrads[i].push_back(grad);
+    }
+  }
+
+  for (const auto& para_config : config.getModelConfig().parameters()) {
+    VectorPtr value = Vector::create(para_config.size(), false);
+    value->randnorm(0, 2);
+    data.paraValues.push_back(value);
+  }
+}
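A side note on the synthetic inputs built by `initArgument` above: each value is drawn uniformly, shifted by -0.5, and squashed with a sigmoid, so the generated inputs land in roughly (0.38, 0.62) and both networks start from well-conditioned activations. A minimal standalone illustration of just that transform (plain C++, no Paddle types; the helper is hypothetical):

```cpp
#include <cmath>
#include <cstdio>
#include <cstdlib>

int main() {
  // Mirror of initArgument's value transform: uniform -> shift -> sigmoid.
  for (int i = 0; i < 5; ++i) {
    double u = (double)rand() / RAND_MAX;   // randomizeUniform()
    double x = u - 0.5;                     // add(-0.5)
    double y = 1.0 / (1.0 + std::exp(-x));  // sigmoid()
    printf("%.4f -> %.4f\n", u, y);         // y stays in ~(0.38, 0.62)
  }
  return 0;
}
```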
+
+void MKLDNNTester::getOutResult(const std::string& configPath,
+                                DataIn& in,
+                                DataOut& out,
+                                bool use_mkldnn,
+                                size_t iter) {
+  FLAGS_use_gpu = false;
+  FLAGS_use_mkldnn = use_mkldnn;
+  *ThreadLocalRand::getSeed() = 1;
+  srand(1);
+
+  Trainer trainer;
+  auto config = std::make_shared<TrainerConfigHelper>(configPath);
+  trainer.init(config, false);
+  auto gradientMachine = trainer.getGradientMachine();
+  std::vector<ParameterPtr> parameters = gradientMachine->getParameters();
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
+  }
+  UpdateCallback simpleUpdate = [](Parameter* para) {
+    auto& grad = para->getBuf(PARAMETER_GRADIENT);
+    auto& value = para->getBuf(PARAMETER_VALUE);
+    real lr = 1e-2;
+    value->add(*grad, lr);
+    grad->zeroMem();
+  };
+
+  vector<Argument> outArgs;
+  gradientMachine->start();
+  out.outValues.clear();
+  out.paraValues.clear();
+  for (size_t i = 0; i < iter; ++i) {
+    VLOG(MKLDNN_TESTS) << "running iteration " << i;
+    gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN);
+    // save forward result
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      const MatrixPtr& src = outArgs[k].value;
+      MatrixPtr dst =
+          Matrix::create(src->getHeight(), src->getWidth(), false, false);
+      if (typeid(*src) == typeid(MKLDNNMatrix)) {
+        MKLDNNMatrixPtr dnnSrc = std::dynamic_pointer_cast<MKLDNNMatrix>(src);
+        dnnSrc->copyTo(*dst);
+      } else {
+        dst->copyFrom(*src);
+      }
+      out.outValues.push_back(dst);
+    }
+
+    // random backward input
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      outArgs[k].grad->copyFrom(*in.outGrads[i][k]);
+    }
+    gradientMachine->backward(simpleUpdate);
+  }
+  gradientMachine->finish();
+
+  // save param value
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    VectorPtr val = Vector::create(
+        parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false);
+    val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
+    out.paraValues.push_back(val);
+  }
+}
+
+void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
+  CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
+  CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
+  for (size_t i = 0; i < ref.outValues.size(); i++) {
+    VLOG(MKLDNN_TESTS) << "compare value index: " << i;
+    EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
+  }
+  for (size_t i = 0; i < ref.paraValues.size(); i++) {
+    VLOG(MKLDNN_TESTS) << "compare param index: " << i;
+    EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
+  }
+}
+
+void MKLDNNTester::runNetTest(const std::string& configPath,
+                              size_t iter,
+                              float eps) {
+  DataIn in;
+  initArgument(in, configPath, iter);
+  DataOut outCpu, outDnn;
+  VLOG(MKLDNN_TESTS) << "running cpu network";
+  getOutResult(configPath, in, outCpu, false, iter);
+  VLOG(MKLDNN_TESTS) << "running mkldnn network";
+  getOutResult(configPath, in, outDnn, true, iter);
+
+  compareResult(outCpu, outDnn, eps);
+}
+
+}  // namespace paddle
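A hypothetical usage sketch (the test name and call site are assumptions, not part of this diff; the config path is one of the files moved under `legacy/` in this change): a whole-network CPU-vs-MKLDNN check boils down to a single static call.

```cpp
#include <gtest/gtest.h>
#include "MKLDNNTester.h"

using namespace paddle;  // NOLINT

// Whole-network check: run the same config with and without MKLDNN,
// feeding identical seeds, inputs, and initial parameters, then require
// the outputs and updated parameters to agree within 1e-4.
TEST(MKLDNNNet, SimpleNet) {
  MKLDNNTester::runNetTest("legacy/gserver/tests/mkldnn_simple_net.conf",
                           /*iter=*/2,
                           /*eps=*/1e-4);
}
```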
diff --git a/paddle/legacy/gserver/tests/MKLDNNTester.h b/paddle/legacy/gserver/tests/MKLDNNTester.h
new file mode 100644
index 0000000000000000000000000000000000000000..086846ce537857eb76ffca492246677eb7982a42
--- /dev/null
+++ b/paddle/legacy/gserver/tests/MKLDNNTester.h
@@ -0,0 +1,143 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "LayerGradUtil.h"
+#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
+#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
+
+namespace paddle {
+
+/**
+ * @brief test the functionality of MKLDNN layers and MKLDNN activations
+ *        against the corresponding original Paddle implementations
+ */
+class MKLDNNTester {
+  enum {
+    DNN = 0,  // MKLDNN layer
+    REF = 1,  // Reference layer
+    NUM = 2,  // Total number of layers under test
+  };
+
+  struct DataIn {
+    std::vector<std::vector<Argument>> inArgs;
+    std::vector<std::vector<MatrixPtr>> outGrads;
+    std::vector<VectorPtr> paraValues;
+  };
+
+  struct DataOut {
+    std::vector<MatrixPtr> outValues;
+    std::vector<VectorPtr> paraValues;
+  };
+
+ protected:
+  std::vector<TestConfig> configs_;
+  vector<string> layerNames_;
+  vector<vector<DataLayerPtr>> dataLayers_;
+  vector<vector<Argument>> datas_;
+  vector<LayerMap> layerMaps_;
+  vector<vector<ParameterPtr>> parameters_;
+  vector<LayerPtr> testLayers_;
+  LayerPtr refLayer_, dnnLayer_;
+
+  /// number of iterations to run; every iteration must pass the checks
+  size_t iter_;
+  /// whether to print out the details
+  bool log_;
+  /// epsilon
+  float eps_;
+  /// input image size, default 1
+  size_t ih_, iw_;
+  /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass)
+  PassType passType_;
+
+ public:
+  explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) {
+    iter_ = iter;
+    eps_ = epsilon;
+    log_ = false;
+    passType_ = PASS_TRAIN;
+  }
+
+  ~MKLDNNTester() {}
+
+ public:
+  void run(const TestConfig& dnn,
+           const TestConfig& ref,
+           size_t batchSize,
+           size_t inputImgH = 1,
+           size_t inputImgW = 1,
+           PassType passType = PASS_TRAIN,
+           bool printDetails = false,
+           size_t iter = 3,
+           float epsilon = 1e-4);
+  static void runNetTest(const std::string& configPath,
+                         size_t iter = 2,
+                         float eps = 1e-4);
+  static void initArgument(DataIn& data,
+                           const std::string& configPath,
+                           size_t iter = 2);
+  static void getOutResult(const std::string& configPath,
+                           DataIn& in,
+                           DataOut& out,
+                           bool use_mkldnn,
+                           size_t iter = 2);
+
+ private:
+  void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
+  void setInputImgSize();
+  void runOnce();
+
+  void randomWgtDatas();
+  void randomBotDatas();
+  void randomTopDiffs();
+
+  void checkForward();
+  void checkBackwardData();
+  void checkBackwardWgts();
+
+  // clear the specified layer's buffers; clear all layers when id equals NUM
+  void clearWgtDiffs(size_t id = NUM);
+  void clearBotDiffs(size_t id = NUM);
+  void clearTopDatas(size_t id = NUM);
+
+  void printTopDatas();
+  void printMatrix(const MatrixPtr& m);
+  void printVector(const VectorPtr& v);
+
+  void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
+  void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
+
+  static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
+  static double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+  static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4);
+
+  /**
+   * Get the relative delta between two buffers.
+   * If the fraction of mismatched points
+   * (abs(val - ref) / abs(ref) > thres) exceeds failRate,
+   * return the max ratio diff/ref;
+   * otherwise return sum(abs(diff)) / sum(abs(ref)).
+   * The return value should be smaller than eps when the check passes.
+ */ + static double getDelta(const real* refer, + const real* value, + size_t len, + const float failRate = 1e-3, + const float thres = 0.1); +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/Sequence/dummy.list b/paddle/legacy/gserver/tests/Sequence/dummy.list similarity index 100% rename from paddle/gserver/tests/Sequence/dummy.list rename to paddle/legacy/gserver/tests/Sequence/dummy.list diff --git a/paddle/gserver/tests/Sequence/tour_dict_phrase.dict b/paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict similarity index 100% rename from paddle/gserver/tests/Sequence/tour_dict_phrase.dict rename to paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict diff --git a/paddle/gserver/tests/Sequence/tour_train_wdseg b/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg similarity index 100% rename from paddle/gserver/tests/Sequence/tour_train_wdseg rename to paddle/legacy/gserver/tests/Sequence/tour_train_wdseg diff --git a/paddle/gserver/tests/Sequence/tour_train_wdseg.nest b/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest similarity index 100% rename from paddle/gserver/tests/Sequence/tour_train_wdseg.nest rename to paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest diff --git a/paddle/legacy/gserver/tests/Sequence/train.list b/paddle/legacy/gserver/tests/Sequence/train.list new file mode 100644 index 0000000000000000000000000000000000000000..1109a2449252cb9bfcb10ece4cf9a96e655e5a25 --- /dev/null +++ b/paddle/legacy/gserver/tests/Sequence/train.list @@ -0,0 +1 @@ +legacy/gserver/tests/Sequence/tour_train_wdseg diff --git a/paddle/legacy/gserver/tests/Sequence/train.list.nest b/paddle/legacy/gserver/tests/Sequence/train.list.nest new file mode 100644 index 0000000000000000000000000000000000000000..a67df35024f456d517899f37272b0f74d822f03d --- /dev/null +++ b/paddle/legacy/gserver/tests/Sequence/train.list.nest @@ -0,0 +1 @@ +legacy/gserver/tests/Sequence/tour_train_wdseg.nest diff --git a/paddle/gserver/tests/__init__.py b/paddle/legacy/gserver/tests/__init__.py similarity index 100% rename from paddle/gserver/tests/__init__.py rename to paddle/legacy/gserver/tests/__init__.py diff --git a/paddle/gserver/tests/concat_dotmul_a.conf b/paddle/legacy/gserver/tests/concat_dotmul_a.conf similarity index 100% rename from paddle/gserver/tests/concat_dotmul_a.conf rename to paddle/legacy/gserver/tests/concat_dotmul_a.conf diff --git a/paddle/gserver/tests/concat_dotmul_b.conf b/paddle/legacy/gserver/tests/concat_dotmul_b.conf similarity index 100% rename from paddle/gserver/tests/concat_dotmul_b.conf rename to paddle/legacy/gserver/tests/concat_dotmul_b.conf diff --git a/paddle/gserver/tests/concat_fullmatrix_a.conf b/paddle/legacy/gserver/tests/concat_fullmatrix_a.conf similarity index 100% rename from paddle/gserver/tests/concat_fullmatrix_a.conf rename to paddle/legacy/gserver/tests/concat_fullmatrix_a.conf diff --git a/paddle/gserver/tests/concat_fullmatrix_b.conf b/paddle/legacy/gserver/tests/concat_fullmatrix_b.conf similarity index 100% rename from paddle/gserver/tests/concat_fullmatrix_b.conf rename to paddle/legacy/gserver/tests/concat_fullmatrix_b.conf diff --git a/paddle/gserver/tests/concat_slice_a.conf b/paddle/legacy/gserver/tests/concat_slice_a.conf similarity index 100% rename from paddle/gserver/tests/concat_slice_a.conf rename to paddle/legacy/gserver/tests/concat_slice_a.conf diff --git a/paddle/gserver/tests/concat_slice_b.conf b/paddle/legacy/gserver/tests/concat_slice_b.conf similarity index 100% rename from 
paddle/gserver/tests/concat_slice_b.conf rename to paddle/legacy/gserver/tests/concat_slice_b.conf diff --git a/paddle/gserver/tests/concat_table_a.conf b/paddle/legacy/gserver/tests/concat_table_a.conf similarity index 100% rename from paddle/gserver/tests/concat_table_a.conf rename to paddle/legacy/gserver/tests/concat_table_a.conf diff --git a/paddle/gserver/tests/concat_table_b.conf b/paddle/legacy/gserver/tests/concat_table_b.conf similarity index 100% rename from paddle/gserver/tests/concat_table_b.conf rename to paddle/legacy/gserver/tests/concat_table_b.conf diff --git a/paddle/gserver/tests/img_conv_a.conf b/paddle/legacy/gserver/tests/img_conv_a.conf similarity index 100% rename from paddle/gserver/tests/img_conv_a.conf rename to paddle/legacy/gserver/tests/img_conv_a.conf diff --git a/paddle/gserver/tests/img_conv_b.conf b/paddle/legacy/gserver/tests/img_conv_b.conf similarity index 100% rename from paddle/gserver/tests/img_conv_b.conf rename to paddle/legacy/gserver/tests/img_conv_b.conf diff --git a/paddle/gserver/tests/img_conv_c.conf b/paddle/legacy/gserver/tests/img_conv_c.conf similarity index 100% rename from paddle/gserver/tests/img_conv_c.conf rename to paddle/legacy/gserver/tests/img_conv_c.conf diff --git a/paddle/gserver/tests/img_conv_cudnn.py b/paddle/legacy/gserver/tests/img_conv_cudnn.py similarity index 100% rename from paddle/gserver/tests/img_conv_cudnn.py rename to paddle/legacy/gserver/tests/img_conv_cudnn.py diff --git a/paddle/gserver/tests/img_conv_exconv.py b/paddle/legacy/gserver/tests/img_conv_exconv.py similarity index 100% rename from paddle/gserver/tests/img_conv_exconv.py rename to paddle/legacy/gserver/tests/img_conv_exconv.py diff --git a/paddle/gserver/tests/img_pool_a.conf b/paddle/legacy/gserver/tests/img_pool_a.conf similarity index 100% rename from paddle/gserver/tests/img_pool_a.conf rename to paddle/legacy/gserver/tests/img_pool_a.conf diff --git a/paddle/gserver/tests/img_pool_b.conf b/paddle/legacy/gserver/tests/img_pool_b.conf similarity index 100% rename from paddle/gserver/tests/img_pool_b.conf rename to paddle/legacy/gserver/tests/img_pool_b.conf diff --git a/paddle/gserver/tests/mkldnn_branch_net.conf b/paddle/legacy/gserver/tests/mkldnn_branch_net.conf similarity index 100% rename from paddle/gserver/tests/mkldnn_branch_net.conf rename to paddle/legacy/gserver/tests/mkldnn_branch_net.conf diff --git a/paddle/gserver/tests/mkldnn_simple_net.conf b/paddle/legacy/gserver/tests/mkldnn_simple_net.conf similarity index 100% rename from paddle/gserver/tests/mkldnn_simple_net.conf rename to paddle/legacy/gserver/tests/mkldnn_simple_net.conf diff --git a/paddle/gserver/tests/pyDataProvider.py b/paddle/legacy/gserver/tests/pyDataProvider.py similarity index 100% rename from paddle/gserver/tests/pyDataProvider.py rename to paddle/legacy/gserver/tests/pyDataProvider.py diff --git a/paddle/gserver/tests/pyDataProvider/pyDataProviderList b/paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList similarity index 100% rename from paddle/gserver/tests/pyDataProvider/pyDataProviderList rename to paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList diff --git a/paddle/gserver/tests/pyDataProvider/trainer.conf b/paddle/legacy/gserver/tests/pyDataProvider/trainer.conf similarity index 100% rename from paddle/gserver/tests/pyDataProvider/trainer.conf rename to paddle/legacy/gserver/tests/pyDataProvider/trainer.conf diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/legacy/gserver/tests/rnn_data_provider.py similarity index 
100% rename from paddle/gserver/tests/rnn_data_provider.py rename to paddle/legacy/gserver/tests/rnn_data_provider.py diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/legacy/gserver/tests/sequenceGen.py similarity index 100% rename from paddle/gserver/tests/sequenceGen.py rename to paddle/legacy/gserver/tests/sequenceGen.py diff --git a/paddle/legacy/gserver/tests/sequence_layer_group.conf b/paddle/legacy/gserver/tests/sequence_layer_group.conf new file mode 100644 index 0000000000000000000000000000000000000000..ad1b61d5821fd20135e61bb95abdea16d27a6a9a --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_layer_group.conf @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 256 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# (lstm_input + lstm) is equal to lstmemory +with mixed_layer(size=hidden_dim * 4) as lstm_input: + lstm_input += full_matrix_projection(input=emb) + +lstm = lstmemory_group( + input=lstm_input, + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation()) + +lstm_last = last_seq(input=lstm) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=lstm_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/legacy/gserver/tests/sequence_lstm.conf b/paddle/legacy/gserver/tests/sequence_lstm.conf new file mode 100644 index 0000000000000000000000000000000000000000..6ab70e70713f31de31b5cd544cf132e7d0af0f2f --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_lstm.conf @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 256 +label_dim = 3 +sparse_update = get_config_arg("sparse_update", bool, False) + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, + size=word_dim, + param_attr=ParamAttr(sparse_update=sparse_update)) + +with mixed_layer(size=hidden_dim * 4) as lstm_input: + lstm_input += full_matrix_projection(input=emb) + +lstm = lstmemory( + input=lstm_input, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation()) + +lstm_last = last_seq(input=lstm) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=lstm_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf b/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf new file mode 100644 index 0000000000000000000000000000000000000000..75c36b118979760e034f81e3127a748651f53347 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/train.list.nest', + test_list=None, + module='sequenceGen', + obj='process2', + args={"dict_file": dict_file}) + +settings(batch_size=2) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 256 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb_group = embedding_layer(input=data, size=word_dim) + + +# (lstm_input + lstm) is equal to lstmemory +def lstm_group(lstm_group_input): + with mixed_layer(size=hidden_dim * 4) as group_input: + group_input += full_matrix_projection(input=lstm_group_input) + + lstm_output = lstmemory_group( + input=group_input, + name="lstm_group", + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation()) + return lstm_output + + +lstm_nest_group = recurrent_group( + input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group") +# hasSubseq ->(seqlastins) seq +lstm_last = last_seq( + input=lstm_nest_group, agg_level=AggregateLevel.TO_SEQUENCE) + +# seq ->(expand) hasSubseq +lstm_expand = expand_layer( + input=lstm_last, + expand_as=emb_group, + expand_level=ExpandLevel.FROM_SEQUENCE) + +# hasSubseq ->(average) seq +lstm_average = pooling_layer( + input=lstm_expand, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.TO_SEQUENCE) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=lstm_average) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn.conf b/paddle/legacy/gserver/tests/sequence_nest_rnn.conf new file mode 100644 index 0000000000000000000000000000000000000000..bc3b22c2a946a62c7a9d3163d3863a090d63539c --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_nest_rnn.conf @@ -0,0 +1,74 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_subseq')
+
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn.conf
+
+def outer_step(x):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+    def inner_step(y):
+        inner_mem = memory(name="inner_rnn_state",
+                           size=hidden_dim,
+                           boot_layer=outer_mem)
+        out = fc_layer(input=[y, inner_mem],
+                       size=hidden_dim,
+                       act=TanhActivation(),
+                       bias_attr=True,
+                       name="inner_rnn_state")
+        return out
+
+    inner_rnn_output = recurrent_group(
+        step=inner_step,
+        name="inner",
+        input=x)
+    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
+
+    # "return last" won't work, because recurrent_group currently requires
+    # the returned sequence type to be the same as the input sequence type.
+    return inner_rnn_output
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=SubsequenceInput(emb))
+
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf
new file mode 100644
index 0000000000000000000000000000000000000000..165ab229897d32ce2cae1d483b3ffd81392a355a
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf
@@ -0,0 +1,76 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_subseq')
+
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn.conf
+
+def outer_step(wid, x):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+    def inner_step(y, wid):
+        z = embedding_layer(input=wid, size=word_dim)
+        inner_mem = memory(name="inner_rnn_state",
+                           size=hidden_dim,
+                           boot_layer=outer_mem)
+        out = fc_layer(input=[y, z, inner_mem],
+                       size=hidden_dim,
+                       act=TanhActivation(),
+                       bias_attr=True,
+                       name="inner_rnn_state")
+        return out
+
+    inner_rnn_output = recurrent_group(
+        step=inner_step,
+        name="inner",
+        input=[x, wid])
+    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
+
+    # "return last" should also work, but currently RecurrentGradientMachine
+    # does not handle it and will report the error: "In hierachical RNN, all
+    # out links should be from sequences now."
+    return inner_rnn_output
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[SubsequenceInput(data), SubsequenceInput(emb)])
+
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a48b7f25c454b492d20e807f09f6d788af44681
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_unequalength_subseq') + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 2 + +speaker1 = data_layer(name="word1", size=dict_dim) +speaker2 = data_layer(name="word2", size=dict_dim) + +emb1 = embedding_layer(input=speaker1, size=word_dim) +emb2 = embedding_layer(input=speaker2, size=word_dim) + + +# This hierarchical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn_multi_unequalength_inputs.conf +def outer_step(x1, x2): + index = [0] + + def inner_step(ipt): + index[0] += 1 + i = index[0] + outer_mem = memory(name="outer_rnn_state_%d" % i, size=hidden_dim) + + def inner_step_impl(y): + inner_mem = memory( + name="inner_rnn_state_" + y.name, + size=hidden_dim, + boot_layer=outer_mem) + out = fc_layer( + input=[y, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name='inner_rnn_state_' + y.name) + return out + + encoder = recurrent_group( + step=inner_step_impl, name='inner_%d' % i, input=ipt) + last = last_seq(name="outer_rnn_state_%d" % i, input=encoder) + return encoder, last + + encoder1, sentence_last_state1 = inner_step(ipt=x1) + encoder2, sentence_last_state2 = inner_step(ipt=x2) + + encoder1_expand = expand_layer( + input=sentence_last_state1, expand_as=encoder2) + + return [encoder1_expand, encoder2] + + +encoder1_rep, encoder2_rep = recurrent_group( + name="outer", + step=outer_step, + input=[SubsequenceInput(emb1), SubsequenceInput(emb2)], + targetInlink=emb2) + +encoder1_last = last_seq(input=encoder1_rep) +encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep) +context = mixed_layer( + input=[ + identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep) + ], + size=hidden_dim) + +rep = last_seq(input=context) +prob = fc_layer( + size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) + +outputs( + classification_cost( + input=prob, label=data_layer( + name="label", size=label_dim))) diff --git a/paddle/legacy/gserver/tests/sequence_recurrent.py b/paddle/legacy/gserver/tests/sequence_recurrent.py new file mode 100644 index 0000000000000000000000000000000000000000..e2c6a7935c28838fb12fc6e44d99dd59636bf7dd --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_recurrent.py @@ -0,0 +1,55 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 128 +label_dim = 3 + +# This config is designed to be equivalent with sequence_recurrent_group.py + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, size=word_dim, param_attr=ParamAttr(name="emb")) + +recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation()) + +recurrent_last = last_seq(input=recurrent) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=recurrent_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/legacy/gserver/tests/sequence_recurrent_group.py b/paddle/legacy/gserver/tests/sequence_recurrent_group.py new file mode 100644 index 0000000000000000000000000000000000000000..b4638bd9075ff5cdd4a5ed1bc0e0d133f9a9ab86 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_recurrent_group.py @@ -0,0 +1,68 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 128 +label_dim = 3 + +# This config is designed to be equivalent with sequence_recurrent.py + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, size=word_dim, param_attr=ParamAttr(name="emb")) + + +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + with mixed_layer( + name="rnn_state", + size=hidden_dim, + bias_attr=False, + act=SoftmaxActivation()) as out: + out += identity_projection(input=y) + out += full_matrix_projection( + input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__")) + return out + + +recurrent = recurrent_group(name="rnn", step=step, input=emb) + +recurrent_last = last_seq(input=recurrent) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=recurrent_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/legacy/gserver/tests/sequence_rnn.conf b/paddle/legacy/gserver/tests/sequence_rnn.conf new file mode 100644 index 0000000000000000000000000000000000000000..3133595c9ce4c25683c06d326a5ebe9d2bf13077 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_rnn.conf @@ -0,0 +1,57 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_seq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + out = fc_layer(input=[y, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + return out + +out = recurrent_group( + name="rnn", + step=step, + input=emb) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..921cef04dda0da396a79592b09d7a7e7177462d5 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py @@ -0,0 +1,84 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_mixed') + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 2 +hidden_dim = 2 +label_dim = 2 + +data1 = data_layer(name="word1", size=dict_dim) +data2 = data_layer(name="word2", size=dict_dim) +label = data_layer(name="label", size=label_dim) + +encoding = embedding_layer(input=data2, size=word_dim) + +subseq = embedding_layer(input=data1, size=word_dim) +seq = embedding_layer(input=data2, size=word_dim) +nonseq = embedding_layer(input=label, size=word_dim) + + +# This hierarchical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn_mixed_inputs.conf +def outer_step(subseq, seq, nonseq, encoding): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + + def inner_step(subseq, seq, nonseq): + inner_mem = memory( + name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem) + + out = fc_layer( + input=[subseq, seq, nonseq, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name='inner_rnn_state') + return out + + decoder = recurrent_group( + step=inner_step, name='inner', input=[subseq, seq, nonseq]) + last = last_seq(name="outer_rnn_state", input=decoder) + context = simple_attention( + encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last) + return context + + +out = recurrent_group( + name="outer", + step=outer_step, + input=[ + subseq, expand_layer( + seq, expand_as=subseq, + expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer( + nonseq, + expand_as=subseq, + expand_level=ExpandLevel.FROM_NO_SEQUENCE), + StaticInput(encoding) + ]) + +rep = last_seq(input=out) +prob = fc_layer( + size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) + +outputs(classification_cost(input=prob, label=label)) diff --git a/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..c7bcaf6c4b21272e1c95d6de7e69e4558d52b9c6 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py @@ -0,0 +1,78 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_mixed') + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 2 +hidden_dim = 2 +label_dim = 2 + +data1 = data_layer(name="word1", size=dict_dim) +data2 = data_layer(name="word2", size=dict_dim) +label = data_layer(name="label", size=label_dim) + +encoding = embedding_layer(input=data2, size=word_dim) + + +# This hierarchical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn_matched_inputs.conf +def outer_step(subseq, seq, nonseq, encoding): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + + def inner_step(data1, data2, label): + inner_mem = memory( + name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem) + + subseq = embedding_layer(input=data1, size=word_dim) + seq = embedding_layer(input=data2, size=word_dim) + nonseq = embedding_layer(input=label, size=word_dim) + + print_layer(input=[data1, seq, label, inner_mem]) + out = fc_layer( + input=[subseq, seq, nonseq, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name='inner_rnn_state') + return out + + decoder = recurrent_group( + step=inner_step, name='inner', + input=[subseq, StaticInput(seq), nonseq]) + last = last_seq(name="outer_rnn_state", input=decoder) + context = simple_attention( + encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last) + return context + + +out = recurrent_group( + name="outer", + step=outer_step, + input=[data1, data2, StaticInput(label), StaticInput(encoding)]) + +rep = last_seq(input=out) +prob = fc_layer( + size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) + +outputs(classification_cost(input=prob, label=label)) diff --git a/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf b/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf new file mode 100644 index 0000000000000000000000000000000000000000..bf4be779a23e081cef33ce2b2734ad91cfa33c0d --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf @@ -0,0 +1,58 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_seq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +def step(y, wid): + z = embedding_layer(input=wid, size=word_dim) + mem = memory(name="rnn_state", size=hidden_dim) + out = fc_layer(input=[y, z, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + return out + +out = recurrent_group( + name="rnn", + step=step, + input=[emb, data]) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..3612b49c2279874a378d4aaed81623f7d0d2ea2f --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py @@ -0,0 +1,76 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(
+    train_list='legacy/gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_unequalength_seq')
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 2
+
+speaker1 = data_layer(name="word1", size=dict_dim)
+speaker2 = data_layer(name="word2", size=dict_dim)
+
+emb1 = embedding_layer(input=speaker1, size=word_dim)
+emb2 = embedding_layer(input=speaker2, size=word_dim)
+
+# This hierarchical RNN is designed to be equivalent to the RNN in
+# sequence_nest_rnn_multi_unequalength_inputs.conf
+
+
+def step(x1, x2):
+    def calrnn(y):
+        mem = memory(name='rnn_state_' + y.name, size=hidden_dim)
+        out = fc_layer(
+            input=[y, mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='rnn_state_' + y.name)
+        return out
+
+    encoder1 = calrnn(x1)
+    encoder2 = calrnn(x2)
+    return [encoder1, encoder2]
+
+
+encoder1_rep, encoder2_rep = recurrent_group(
+    name="stepout", step=step, input=[emb1, emb2])
+
+encoder1_last = last_seq(input=encoder1_rep)
+encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
+context = mixed_layer(
+    input=[
+        identity_projection(encoder1_expandlast),
+        identity_projection(encoder2_rep)
+    ],
+    size=hidden_dim)
+
+rep = last_seq(input=context)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+
+outputs(
+    classification_cost(
+        input=prob, label=data_layer(
+            name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/test_ActivationGrad.cpp b/paddle/legacy/gserver/tests/test_ActivationGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f468d229a889e02bf79baa29576c638acbd8eb08
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_ActivationGrad.cpp
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+void testActivation(const string& act) {
+  LOG(INFO) << "test activation: " << act;
+  size_t size = 10;
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("addto");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type(act);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  act + "_activation",
+                  100,
+                  /* trans= */ false,
+                  useGpu,
+                  /* useWeight */ true);
+  }
+}
+
+TEST(Activation, activation) {
+  auto types = ActivationFunction::getAllRegisteredTypes();
+  std::set<string> excluded{"sequence_softmax"};
+  for (auto type : types) {
+    if (excluded.count(type)) continue;
+    testActivation(type);
+  }
+}
+
+void testSequenceSoftmaxAct(bool hasSubseq) {
+  LOG(INFO) << "test activation: sequence softmax";
+
+  const size_t size = 1;
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("addto");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sequence_softmax");
+  config.inputDefs.push_back(
+      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+       "layer_0",
+       1,
+       0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sequence_softmax",
+                  100,
+                  /* trans= */ false,
+                  useGpu,
+                  /* useWeight */ true);
+  }
+}
+
+TEST(SequenceSoftmaxActivation, activation) {
+  for (auto hasSubseq : {false, true}) {
+    LOG(INFO) << "hasSubseq = " << hasSubseq;
+    testSequenceSoftmaxAct(hasSubseq);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_BatchNorm.cpp b/paddle/legacy/gserver/tests/test_BatchNorm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7a65a30510e225e2cfeabb7c851a4533771d44a
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_BatchNorm.cpp
@@ -0,0 +1,195 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/legacy/cuda/include/hl_batch_norm.h"
+#include "paddle/legacy/math/tests/TensorCheck.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
+
+// Test that the batchNormLayer can be followed by a ConvLayer
+TEST(Layer, batchNorm) {
+  FLAGS_use_gpu = false;
+  TestConfig configBN;
+  const int CHANNELS = 6272;
+  const int IMG_SIZE = 1;
+  configBN.layerConfig.set_type("batch_norm");
+  configBN.layerConfig.set_name("bn");
+  configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
+  configBN.layerConfig.set_active_type("relu");
+  configBN.biasSize = CHANNELS;
+  configBN.inputDefs.push_back({INPUT_DATA,
+                                "layer_0",
+                                /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
+                                /* paraSize= */ CHANNELS});
+
+  configBN.inputDefs.push_back(
+      {INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
+  configBN.inputDefs.back().isStatic = true;
+  configBN.inputDefs.push_back(
+      {INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
+  configBN.inputDefs.back().isStatic = true;
+
+  LayerInputConfig* input = configBN.layerConfig.add_inputs();
+  configBN.layerConfig.add_inputs();
+  configBN.layerConfig.add_inputs();
+
+  ImageConfig* img_conf = input->mutable_image_conf();
+  img_conf->set_channels(CHANNELS);
+  img_conf->set_img_size(IMG_SIZE);
+
+  // Setting up conv-layer config
+  TestConfig config;
+  config.biasSize = 64;
+  config.layerConfig.set_type("exconv");
+  config.layerConfig.set_num_filters(64);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800});
+  input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(5);
+  conv->set_filter_size_y(5);
+  conv->set_channels(128);
+  conv->set_padding(1);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(7);
+  conv->set_output_x(3);
+  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+                              config.layerConfig.num_filters());
+  config.layerConfig.set_name("conv");
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(configBN,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "batch_norm",
+                100,
+                false,
+                false);
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr bnLayer;
+  initTestLayer(configBN, &layerMap, &parameters, &bnLayer);
+
+  std::vector<ParameterPtr> parameters2;
+  LayerPtr convLayer;
+  initTestLayer(config, &layerMap, &parameters2, &convLayer);
+
+  bnLayer->forward(PASS_GC);
+  convLayer->forward(PASS_GC);
+
+  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getHeight()), 100);
+  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
+}
+
+#ifdef PADDLE_WITH_CUDA
+void batchNormInference(int n, int c, int h, int w) {
+  MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
+  MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
+  MatrixPtr cudaOut = std::make_shared<GpuMatrix>(n, c * h * w);
+  MatrixPtr cudnnCheck = std::make_shared<CpuMatrix>(n, c * h * w);
+  MatrixPtr cudaCheck = std::make_shared<CpuMatrix>(n, c * h * w);
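+  // The rest of this helper runs batch-norm inference twice over the
+  // same randomized inputs: once through the cuDNN descriptor path
+  // (hl_batch_norm_forward_inference) and once through Paddle's own
+  // CUDA kernel (hl_batch_norm_cuda_inference). Both GPU results are
+  // copied into the CPU "check" matrices allocated above and compared
+  // elementwise with autotest::TensorCheckErr.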
+  input->randomizeUniform();
+  cudnnOut->zeroMem();
+  cudaOut->zeroMem();
+
+  MatrixPtr scale = std::make_shared<GpuMatrix>(1, c);
+  scale->randomizeUniform();
+  MatrixPtr bias = std::make_shared<GpuMatrix>(1, c);
+  bias->randomizeUniform();
+
+  MatrixPtr movingMean = std::make_shared<GpuMatrix>(1, c);
+  movingMean->randomizeUniform();
+
+  MatrixPtr movingVar = std::make_shared<GpuMatrix>(1, c);
+  movingVar->randomizeUniform();
+  movingVar->clip(0.01, 50);
+
+  hl_tensor_descriptor ioDesc;
+  hl_tensor_descriptor bnDesc;
+  hl_create_tensor_descriptor(&ioDesc);
+  hl_create_tensor_descriptor(&bnDesc);
+  hl_tensor_reshape(ioDesc, n, c, h, w);
+  hl_tensor_reshape(bnDesc, 1, c, 1, 1);
+
+  double EPS = 1E-5;
+  hl_batch_norm_forward_inference(ioDesc,
+                                  input->getData(),
+                                  ioDesc,
+                                  cudnnOut->getData(),
+                                  bnDesc,
+                                  scale->getData(),
+                                  bias->getData(),
+                                  movingMean->getData(),
+                                  movingVar->getData(),
+                                  EPS);
+
+  hl_batch_norm_cuda_inference(input->getData(),
+                               cudaOut->getData(),
+                               scale->getData(),
+                               bias->getData(),
+                               movingMean->getData(),
+                               movingVar->getData(),
+                               EPS,
+                               n,
+                               c,
+                               h,
+                               w);
+
+  cudnnCheck->copyFrom(*cudnnOut);
+  cudaCheck->copyFrom(*cudaOut);
+  autotest::TensorCheckErr(*cudnnCheck, *cudaCheck);
+
+  hl_destroy_tensor_descriptor(ioDesc);
+  hl_destroy_tensor_descriptor(bnDesc);
+}
+
+TEST(BatchNorm, Inference) {
+  batchNormInference(33, 267, 1, 1);
+  batchNormInference(19, 105, 4, 4);
+}
+#endif
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp b/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1dafd1de4d82f1d306626090c30cf9203fa24dd0
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/gserver/layers/LinearChainCRF.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
+  for (auto& v : seq) {
+    if (++v < numClasses) {
+      return true;
+    }
+    v = 0;
+  }
+  return false;
+}
+
+// log(exp(x) + exp(y))
+static inline real logSum(real x, real y) {
+  real maxValue = std::max(x, y);
+  if (std::isinf(maxValue)) {
+    return -std::numeric_limits<real>::infinity();
+  } else {
+    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
+  }
+}
+
+static inline std::vector<int> genRandLabels(int numClasses, int length) {
+  std::vector<int> labels(length);
+  for (int i = 0; i < length; ++i) {
+    labels[i] = rand() % numClasses;  // NOLINT
+  }
+  return labels;
+}
+
+TEST(CRFLayer, cost) {
+  const int numClasses = 4;
+  CpuVector para(numClasses * (numClasses + 2));
+  real* a = para.getData();
+  real* b = para.getData() + numClasses;
+  real* w = para.getData() + 2 * numClasses;
+  LinearChainCRF crf(4, para.getData());
+  for (int length : {1, 2, 3, 10}) {
+    for (int tries = 0; tries < 10; ++tries) {
+      CpuMatrix x(length, numClasses);
+      x.randomizeUniform();
+      para.randnorm(0, 2);
+
+      std::vector<int> goldenLabels = genRandLabels(numClasses, length);
+
+      real cost = crf.forward(x.getData(), goldenLabels.data(), length);
+
+      real logZ = -std::numeric_limits<real>::infinity();
+      real logNominator = -std::numeric_limits<real>::infinity();
+      std::vector<int> testResult(length, 0);
+      do {
+        real score = a[testResult.front()];
+        score += x.getElement(0, testResult.front());
+        for (int k = 1; k < length; ++k) {
+          score += x.getElement(k, testResult[k]) +
+                   w[numClasses * testResult[k - 1] + testResult[k]];
+        }
+        score += b[testResult.back()];
+        logZ = logSum(logZ, score);
+
+        if (goldenLabels == testResult) {
+          logNominator = score;
+        }
+      } while (getNextSequence(testResult, numClasses));
+
+      real trueCost = -logNominator + logZ;
+
+      real diff = fabs(trueCost - cost);
+      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
+      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost
+              << " diff=" << diff << std::endl;
+      if (typeid(real) == typeid(double)) {  // NOLINT
+        EXPECT_LE(diff, 1e-10);
+      } else {
+        EXPECT_LE(diff, 5e-3);
+      }
+    }
+  }
+}
+
+inline real epsilon() { return typeid(real) == typeid(double) ?
1e-10 : 0.06; } + +TestConfig initTestConfig(size_t numClasses, bool withWeight) { + TestConfig config; + config.layerConfig.set_type("crf"); + config.layerConfig.set_size(numClasses); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, + "layer_0", + numClasses, + numClasses * (numClasses + 2)}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back( + {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0}); + config.layerConfig.add_inputs(); + + if (withWeight) { + config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0}); + config.layerConfig.add_inputs(); + } + + return config; +} + +TEST(Layer, CRFLayer) { + size_t numClasses = 10; + for (int tries = 0; tries < 5; ++tries) { + TestConfig config = initTestConfig(numClasses, /* withWeight= */ false); + for (int length : {1, 3, 100}) { + // Not support GPU now + testLayerGrad(config, + "crf", + length, + /* trans= */ false, + /* useGpu= */ false, + /* useWeight= */ false, + epsilon()); + } + } +} + +TEST(Layer, CRFLayerUseWeight) { + size_t numClasses = 10; + for (int tries = 0; tries < 5; ++tries) { + TestConfig config = initTestConfig(numClasses, /* withWeight= */ true); + for (int length : {1, 3, 100}) { + // Not support GPU now + testLayerGrad(config, + "crf", + length, + /* trans= */ false, + /* useGpu= */ false, + /* useWeight= */ false, + epsilon()); + } + } +} + +int main(int argc, char** argv) { + initMain(argc, argv); + hl_start(); + hl_init(FLAGS_gpu_id); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_CompareSparse.cpp b/paddle/legacy/gserver/tests/test_CompareSparse.cpp new file mode 100644 index 0000000000000000000000000000000000000000..51433c9aaaec3d195a5591f20103bfa66cd4e4ea --- /dev/null +++ b/paddle/legacy/gserver/tests/test_CompareSparse.cpp @@ -0,0 +1,228 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/trainer/Trainer.h" + +#include +#include + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +static const string& configFile1 = "legacy/gserver/tests/sequence_lstm.conf"; + +DECLARE_bool(use_gpu); +DECLARE_string(config); +DECLARE_int32(gpu_id); +DECLARE_int32(seed); +DECLARE_int32(num_passes); +DECLARE_int32(saving_period); + +DECLARE_int32(num_gradient_servers); +DECLARE_int32(port); +DECLARE_bool(local); +DECLARE_bool(use_old_updater); +DECLARE_bool(parallel_nn); +DECLARE_string(config_args); +DEFINE_double(max_diff_ratio, + 0.0f, + "max diff ratio allowed for parameters value"); + +int gNumDevices = 0; + +std::vector trainerOnePassTest(const string& configFile, + bool sparseUpdate, + int trainerCount = 1, + bool useGpu = false) { + FLAGS_use_gpu = useGpu; + FLAGS_config = configFile; + FLAGS_trainer_count = trainerCount; + FLAGS_config_args = sparseUpdate ? 
"sparse_update=1" : "sparse_update=0"; + + LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount + << " configFile=" << configFile << " sparseUpdate=" << sparseUpdate; + srand(FLAGS_seed); + *ThreadLocalRand::getSeed() = FLAGS_seed; + ThreadLocalRandomEngine::get().seed(FLAGS_seed); + if (useGpu) { + CHECK_LE(trainerCount, gNumDevices); + } + + std::vector> pservers; + if (!FLAGS_local) { + int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse; + pservers.resize(numPorts); + + for (int i = 0; i < numPorts; ++i) { + pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i)); + pservers[i]->init(); + pservers[i]->start(); + } + } + + Trainer trainer; + trainer.init(TrainerConfigHelper::createFromFlagConfig()); + trainer.train(); + return trainer.getGradientMachine()->getParameters(); +} + +std::vector& getDenseParameters() { + static std::vector denseParameters; + if (denseParameters.empty()) { + // use dense training as base + FLAGS_local = true; + denseParameters = trainerOnePassTest(configFile1, false); + } + + return denseParameters; +} + +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + double maxDiffRatio) { + double maxDiff = 0; + double maxValue = 0; + for (size_t i = 0; i < len; ++i) { + double diff = fabs(A[i] - B[i]); + maxValue = std::max(maxValue, std::max(fabs(A[i]), fabs(B[i]))); + maxDiff = std::max(maxDiff, diff); + } + EXPECT_LE(maxDiff / maxValue, maxDiffRatio); + LOG(INFO) << " maxDiff=" << maxDiff << " maxValue=" << maxValue + << " maxDiff/maxValue=" << maxDiff / maxValue << "\n\n"; +} + +void compareValue(const vector& parametersA, + const vector& parametersB, + double maxDiffRatio = 0.0) { + LOG(INFO) << "\n\n--------------------------------" + << " Check Gradient Machine Parameters:" + << " -------------------------------------\n"; + for (size_t i = 0; i < parametersA.size(); ++i) { + ParameterPtr parameterA, parameterB; + parameterA = parametersA[i]; + parameterB = parametersB[i]; + + CpuVector paraA(parameterA->getSize()); + CpuVector paraB(parameterB->getSize()); + paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE)); + paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE)); + + LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() + << " ; size : " << paraA.getSize() << " ------------"; + checkBuffer(paraA.getData(), + "para_A", + paraB.getData(), + "para_B", + paraA.getSize(), + maxDiffRatio); + } +} + +TEST(compareSparse, cpu) { + FLAGS_local = 1; // disable remote sparse update in parameter config + std::vector parameters = trainerOnePassTest(configFile1, true); + compareValue(getDenseParameters(), parameters); +} + +TEST(compareSparse, remote_cpu) { + FLAGS_local = 0; // will enable remote sparse update + FLAGS_ports_num_for_sparse = 5; + std::vector parameters = trainerOnePassTest(configFile1, true); + compareValue(getDenseParameters(), parameters); +} + +TEST(compareSparse, cpu10_local_vs_remote) { + FLAGS_local = 1; // disable remote sparse update in parameter config + std::vector localParameters = + trainerOnePassTest(configFile1, true, 2); + + FLAGS_local = 0; // will enable remote sparse update + FLAGS_ports_num_for_sparse = 5; + std::vector remoteParameters = + trainerOnePassTest(configFile1, true, 2); + + compareValue(localParameters, remoteParameters); +} + +TEST(compareSparse, multiGradientMachine) { + int numGpu; +#ifdef PADDLE_TYPE_DOUBLE + double eps = 1e-8; +#else + double eps = 1e-4; +#endif + numGpu = hl_get_device_count(); + for (bool local : 
{false, true}) { + FLAGS_local = local; + FLAGS_ports_num_for_sparse = 5; + for (bool useGpu : {false, true}) { +#ifndef PADDLE_WITH_CUDA + if (useGpu) continue; +#endif + FLAGS_parallel_nn = useGpu; + LOG(INFO) << " local=" << local << " useGpu=" << useGpu; + int trainerCount = useGpu ? numGpu : 2; + std::vector parameters = + trainerOnePassTest(configFile1, true, trainerCount, useGpu); + compareValue(getDenseParameters(), parameters, eps); + } + } + FLAGS_parallel_nn = false; +} + +TEST(compareSparse, NeuralNetwork) { +#ifdef PADDLE_TYPE_DOUBLE + double eps = 1e-8; +#else + double eps = 1e-4; +#endif + for (bool local : {false, true}) { + FLAGS_local = local; + FLAGS_ports_num_for_sparse = 5; + for (bool useGpu : {false, true}) { +#ifndef PADDLE_WITH_CUDA + if (useGpu) continue; +#endif + FLAGS_parallel_nn = useGpu; + LOG(INFO) << " local=" << local << " useGpu=" << useGpu; + int trainerCount = 1; + std::vector parameters = + trainerOnePassTest(configFile1, true, trainerCount, useGpu); + compareValue(getDenseParameters(), parameters, useGpu ? eps : 0); + } + } + FLAGS_parallel_nn = false; +} + +int main(int argc, char** argv) { + // FIXME(tonyyang-svail): + // Turn off this test due CI failure: + // https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430 + return 0; + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + initPython(argc, argv); + + gNumDevices = hl_get_device_count(); + FLAGS_num_passes = 1; // train one pass + FLAGS_saving_period = 100000; // do not save parameter + + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp b/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3ac86ce516afa751b5625293be901ffa81eb698a --- /dev/null +++ b/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp @@ -0,0 +1,210 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include + +#include "paddle/trainer/Trainer.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_int32(gpu_id); + +DECLARE_bool(local); +DECLARE_bool(use_gpu); + +DECLARE_string(config); +DECLARE_string(nics); + +DEFINE_bool(need_high_accuracy, + false, + "whether need to run in double accuracy"); +DEFINE_double( + max_diff_ratio, + 0.0f, + "max diff ratio allowed for outputs and parameters (value/gradient)"); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_int32(seed); + +static const string& config_file_a = + "legacy/gserver/tests/sequence_recurrent.py"; +static const string& config_file_b = + "legacy/gserver/tests/sequence_recurrent_group.py"; + +struct ComData { + vector outArgs; + vector parameters; +}; + +void calcGradient(ComData& data, const string configFile) { + FLAGS_config = configFile; + + FLAGS_local = true; + FLAGS_use_gpu = false; + + FLAGS_nics = ""; + + *ThreadLocalRand::getSeed() = FLAGS_seed; + srand(FLAGS_seed); + + Trainer trainer; + trainer.init(TrainerConfigHelper::createFromFlagConfig(), false); + + data.parameters = trainer.getGradientMachine()->getParameters(); + + DataBatch dataBatch; + int32_t batchSize = trainer.getConfig().opt_config().batch_size(); + + trainer.getDataProvider()->reset(); + trainer.getDataProvider()->setSkipShuffle(); + trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); + + CHECK(dataBatch.getSize()) << "No data from data provider"; + vector& inArgs = dataBatch.getStreams(); + + trainer.getGradientMachine()->start(); + trainer.getGradientMachine()->forwardBackward( + inArgs, &data.outArgs, PASS_TRAIN); + + trainer.getGradientMachine()->finish(); +} + +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + size_t width = 1) { + int nNum = 0; + real maxVal = 0; + for (size_t i = 0; i < len; ++i) { + maxVal = std::max(maxVal, std::max(A[i], B[i])); + } + real maxDiff = 0; + for (size_t i = 0; i < len; ++i) { + real diff = fabs(A[i] - B[i]); + maxDiff = std::max(maxDiff, diff); + if (diff > maxVal * FLAGS_max_diff_ratio) { + nNum++; + VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] << " " + << desB << " : " << B[i] << " diff=" << diff; + } + } + EXPECT_EQ(0, nNum); + LOG(INFO) << "maxValue=" << maxVal << " maxDiff=" << maxDiff << "\n\n"; +} + +void compareGradient(ComData& comDataA, ComData& comDataB) { + vector outArgsA = comDataA.outArgs; + vector outArgsB = comDataB.outArgs; + + for (size_t i = 0; i < outArgsA.size(); ++i) { + CpuMatrix matA(outArgsA[i].value->getHeight(), + outArgsA[i].value->getWidth()); + CpuMatrix matB(outArgsB[i].value->getHeight(), + outArgsB[i].value->getWidth()); + + matA.copyFrom(*outArgsA[i].value); + matB.copyFrom(*outArgsB[i].value); + + LOG(INFO) << "\n--------------------------------" + << " Check Network Output_" << i << ":" + << " -------------------------------------\n"; + checkBuffer(matA.getData(), + "network A output", + matB.getData(), + "network B output", + matA.getElementCnt(), + matA.getWidth()); + } + + vector& parametersA = comDataA.parameters; + vector& parametersB = comDataB.parameters; + + LOG(INFO) << "\n\n--------------------------------" + << " Check Gradient Machine Parameters:" + << " -------------------------------------\n"; + for (size_t i = 0; i < parametersA.size(); ++i) { + ParameterPtr parameterA, parameterB; + parameterA = parametersA[i]; + parameterB = parametersB[i]; + + CpuVector paraA(parameterA->getSize()); + CpuVector 
paraB(parameterB->getSize()); + paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE)); + paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE)); + + LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() + << " ; size : " << paraA.getSize() << " ------------"; + checkBuffer(paraA.getData(), + "Network A", + paraB.getData(), + "Network B", + paraA.getSize()); + + CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT)); + CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT)); + + LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName() + << " ; size : " << gradA.getSize() << " -----------"; + checkBuffer(gradA.getData(), + "Network A", + gradB.getData(), + "Network B", + gradA.getSize()); + } +} + +TEST(Trainer, create) { + ComData dataA; + calcGradient(dataA, config_file_a); + LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n"; + + ComData dataB; + calcGradient(dataB, config_file_b); + LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n"; + + compareGradient(dataA, dataB); +} + +int main(int argc, char** argv) { + FLAGS_thread_local_rand_use_global_seed = true; + paddle::initMain(argc, argv); + testing::InitGoogleTest(&argc, argv); + initPython(argc, argv); + +#ifndef PADDLE_TYPE_DOUBLE + if (FLAGS_need_high_accuracy) { + LOG(INFO) << "skip test due to it's need high accuracy"; + return 0; + } + if (FLAGS_max_diff_ratio == 0.0f) { + FLAGS_max_diff_ratio = 1e-5; + LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio + << " in low accuracy mode"; + } +#else + if (FLAGS_max_diff_ratio == 0.0f) { + FLAGS_max_diff_ratio = 1e-10; + LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio + << " in high accuracy mode"; + } +#endif + + int ret = RUN_ALL_TESTS(); + return ret; +} diff --git a/paddle/legacy/gserver/tests/test_ConvTrans.cpp b/paddle/legacy/gserver/tests/test_ConvTrans.cpp new file mode 100644 index 0000000000000000000000000000000000000000..41a03f3b44c8728ee48bf29dd1596c7af978a157 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_ConvTrans.cpp @@ -0,0 +1,244 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/utils/GlobalConstants.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(prev_batch_state); + +// Test that the convTrans forward is the same as conv backward +TEST(Layer, convTransLayerFwd) { + // Setting up conv-trans layer + TestConfig configt; + configt.biasSize = 3; + configt.layerConfig.set_type("exconvt"); + configt.layerConfig.set_num_filters(3); + configt.layerConfig.set_partial_sum(1); + configt.layerConfig.set_shared_biases(true); + + configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(4); + conv->set_channels(16); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(3 / conv->groups()); + conv->set_img_size(16); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + configt.layerConfig.set_size(conv->img_size() * conv->img_size() * + configt.layerConfig.num_filters()); + configt.layerConfig.set_name("convTrans"); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "convTrans", 100, false, false); + // test layer initialize + std::vector parameters; + LayerPtr convtLayer; + initTestLayer(configt, &layerMap, ¶meters, &convtLayer); + convtLayer->getBiasParameter()->zeroMem(); + convtLayer->forward(PASS_GC); + + // Setting up conv-layer config + TestConfig config; + config.biasSize = 16; + config.layerConfig.set_type("exconv"); + config.layerConfig.set_num_filters(16); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384}); + input = config.layerConfig.add_inputs(); + conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(4); + conv->set_channels(3); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(16); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_x() * + config.layerConfig.num_filters()); + config.layerConfig.set_name("conv"); + + // data layer initialize + std::vector dataLayers2; + LayerMap layerMap2; + vector datas2; + initDataLayer( + config, &dataLayers2, &datas2, &layerMap2, "conv", 100, false, false); + // test layer initialize + std::vector parameters2; + LayerPtr convLayer; + initTestLayer(config, &layerMap2, ¶meters2, &convLayer); + + // Sync convLayer and convtLayer parameter + convLayer->getBiasParameter()->zeroMem(); + convLayer->getParameters()[0] + ->getBuf(PARAMETER_VALUE) + ->copyFrom(*(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE))); + + // Set convLayer outputGrad 
as convTransLayer input value + convLayer->forward(PASS_GC); + convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue())); + + vector callbackFlags(parameters2.size(), 0); + auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; }; + convLayer->backward(callback); + + // Check that the convLayer backward is the same as convTransLayer forward + checkMatrixEqual(convtLayer->getOutputValue(), + dataLayers2[0]->getOutputGrad()); +} + +// Do one forward pass of convTrans layer and check to see if its output +// matches the given result +void doOneConvtTest(size_t imgSize, + size_t output_x, + size_t stride, + size_t padding, + size_t filter_size, + MatrixPtr& result) { + TestConfig configt; + configt.biasSize = 1; + configt.layerConfig.set_type("exconvt"); + configt.layerConfig.set_num_filters(1); + configt.layerConfig.set_partial_sum(1); + configt.layerConfig.set_shared_biases(true); + + configt.inputDefs.push_back( + {INPUT_DATA, "layer_0", output_x * output_x, filter_size * filter_size}); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(filter_size); + conv->set_filter_size_y(filter_size); + conv->set_channels(1); + conv->set_padding(padding); + conv->set_padding_y(padding); + conv->set_stride(stride); + conv->set_stride_y(stride); + conv->set_groups(1); + conv->set_filter_channels(1); + conv->set_img_size(imgSize); + conv->set_output_x(output_x); + + configt.layerConfig.set_size(conv->img_size() * conv->img_size() * + configt.layerConfig.num_filters()); + configt.layerConfig.set_name("convTrans"); + + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "convTrans", 1, false, false); + dataLayers[0]->getOutputValue()->zeroMem(); + dataLayers[0]->getOutputValue()->add(1.0); + + // test layer initialize + std::vector parameters; + LayerPtr convtLayer; + initTestLayer(configt, &layerMap, ¶meters, &convtLayer); + convtLayer->getBiasParameter()->zeroMem(); + convtLayer->getParameters()[0]->zeroMem(); + convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0); + convtLayer->forward(PASS_GC); + + checkMatrixEqual(convtLayer->getOutputValue(), result); +} + +TEST(Layer, convTransLayerFwd2) { + MatrixPtr result; + result = Matrix::create(1, 5 * 5, false, false); + result->zeroMem(); + result->add(1.0); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 1, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 5, + result); + + real resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, + 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; + result->setData(resultData); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 4, + result); + + real resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, + 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; + result->setData(resultData2); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 2, + /* padding */ 1, + /* filter_size */ 5, + result); + + real resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4, + 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1}; + result->setData(resultData3); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 2, + /* padding */ 0, + /* filter_size */ 3, + result); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + return RUN_ALL_TESTS(); +} diff --git 
a/paddle/legacy/gserver/tests/test_ConvUnify.cpp b/paddle/legacy/gserver/tests/test_ConvUnify.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a01a2b69374bc6e086feaee6ce84737ab034244f --- /dev/null +++ b/paddle/legacy/gserver/tests/test_ConvUnify.cpp @@ -0,0 +1,315 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/utils/GlobalConstants.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(prev_batch_state); + +// Do one forward pass of ConvLayer using either exconv or cudnn_conv +MatrixPtr doOneConvTest(size_t imgSize, + size_t output_x, + size_t stride, + size_t padding, + size_t filter_size, + size_t channel, + size_t numfilters, + size_t groups, + MatrixPtr& inputData, + real* param, + bool useGpu, + bool isDeconv = false) { + TestConfig config; + config.biasSize = numfilters; + string layerType; + if (useGpu) { + layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv"; + } else { + layerType = (isDeconv) ? 
"exconvt" : "exconv"; + } + config.layerConfig.set_type(layerType); + config.layerConfig.set_num_filters(numfilters); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + size_t weightSize = channel * filter_size * filter_size * + config.layerConfig.num_filters() / groups; + if (isDeconv) { + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize}); + config.layerConfig.set_size(imgSize * imgSize * + config.layerConfig.num_filters()); + } else { + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize}); + config.layerConfig.set_size(output_x * output_x * + config.layerConfig.num_filters()); + } + + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(filter_size); + conv->set_filter_size_y(filter_size); + conv->set_channels(channel); + conv->set_padding(padding); + conv->set_padding_y(padding); + conv->set_stride(stride); + conv->set_stride_y(stride); + conv->set_groups(groups); + conv->set_img_size(imgSize); + conv->set_output_x(output_x); + + if (isDeconv) { + conv->set_filter_channels(numfilters / groups); + } else { + conv->set_filter_channels(channel / groups); + } + + config.layerConfig.set_name("conv"); + + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + config, &dataLayers, &datas, &layerMap, "conv", 1, false, useGpu); + dataLayers[0]->getOutputValue()->zeroMem(); + dataLayers[0]->getOutputValue()->copyFrom(*inputData); + + // test layer initialize + std::vector parameters; + LayerPtr convLayer; + initTestLayer(config, &layerMap, ¶meters, &convLayer); + convLayer->getBiasParameter()->zeroMem(); + convLayer->getParameters()[0]->zeroMem(); + convLayer->getParameters()[0] + ->getBuf(PARAMETER_VALUE) + ->copyFrom(param, weightSize); + convLayer->forward(PASS_GC); + + return convLayer->getOutputValue(); +} + +TEST(Layer, convParaUnified) { +#ifdef PADDLE_WITH_CUDA + MatrixPtr input, resultCpu, resultGpu; + + /// TEST1 for conv /// + input = Matrix::create(1, 4 * 4, false, false); + real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1}; + + input->setData(inputData); + + resultCpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ false); + + resultGpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST1 for deconv /// + input = Matrix::create(1, 2 * 2, false, false); + real inputDataT[] = {1, 2, 3, 4}; + input->setData(inputDataT); + + resultCpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ false, + /*isDeconv*/ true); + + resultGpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ true, + /*isDeconv*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST2 for conv /// + input = Matrix::create(1, 3 * 3 * 2, false, false); 
+ real inputData2[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + real param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1}; + + input->setData(inputData2); + + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ false); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST3 for conv /// + real param3[] = {1, 2, 3, 4, 4, 3, 2, 1}; + + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ false); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST2 for deconv /// + input = Matrix::create(1, 2 * 2 * 2, false, false); + real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8}; + input->setData(inputData2T); + + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ false, + /*isDeconv*/ true); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ true, + /*isDeconv*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST3 for deconv /// + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ false, + /*isDeconv*/ true); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ true, + /*isDeconv*/ true); + checkMatrixEqual(resultCpu, resultGpu); +#endif +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..34eb0dedffeba46c662a0e69ce9ba82f474a8358 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp @@ -0,0 +1,352 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT + +DECLARE_int32(gpu_id); +DECLARE_bool(thread_local_rand_use_global_seed); + +const size_t MAX_SEQ_NUM = 23; +const size_t MAX_SEQ_LEN = 50; +const size_t MAX_BEAM_SIZE = 27; + +const size_t SEED = (size_t)(time(NULL)); + +struct SingleBeamExpansion { + vector seqStartPos; + vector subSeqStartPos; + vector candidateScores; + + // TODO(caoying): store this into Argument.ids + vector selectedIndices; + + vector groundTruth; + vector inBeam; + vector rowIdxInBeam; + vector colIdxInBeam; + + void resetGroundTruth(size_t n) { + groundTruth.clear(); + groundTruth.resize(n, -1); + + inBeam.clear(); + inBeam.resize(n, 0); + + rowIdxInBeam.clear(); + rowIdxInBeam.resize(n, -1); + + colIdxInBeam.clear(); + colIdxInBeam.resize(n, -1); + } +}; + +inline float randFloat() { + return static_cast(rand()) / static_cast(RAND_MAX); +} + +void genRand(real* numbers, size_t n) { + default_random_engine generator; + uniform_real_distribution distribution(0.0, 1.0); + for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator); +} + +vector randSampling(real range, int n) { + CHECK_GE(range, n); + vector num(range); + iota(begin(num), end(num), 0.); + if (range == n) return num; + + random_shuffle(begin(num), end(num)); + num.resize(n); + sort(begin(num), end(num)); + return num; +} + +void genCandidateScores(bool hasSubseq, + size_t beamSize, + SingleBeamExpansion& prevBeam, + SingleBeamExpansion& curBeam) { + vector& seqStartPos = curBeam.seqStartPos; + seqStartPos.resize(1, 0); + vector& subSeqStartPos = curBeam.subSeqStartPos; + subSeqStartPos.resize(1, 0); + + srand(SEED); + if (prevBeam.selectedIndices.size()) { + if (prevBeam.subSeqStartPos.size() > 1) { + int seqIdx = 1; + // samples in previous beam are nested sequences. + for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) { + for (size_t j = 0; j < beamSize; ++j) { + if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break; + subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) + + subSeqStartPos.back()); + } + if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) { + seqStartPos.push_back(subSeqStartPos.back()); + seqIdx++; + } + } + } else { + for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) { + if (i && i % beamSize == 0) { + seqStartPos.push_back(subSeqStartPos.back()); + if (i == prevBeam.selectedIndices.size()) break; + } + if (prevBeam.selectedIndices[i] == -1.) continue; + subSeqStartPos.push_back(subSeqStartPos.back() + + (1 + (rand() % MAX_SEQ_LEN))); + } + } + } else { + // the first beam expansion + int seqNum = 1 + (rand() % MAX_SEQ_NUM); + for (int i = 0; i < seqNum; ++i) { + if (hasSubseq) { + for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j) + subSeqStartPos.push_back(subSeqStartPos.back() + + (1 + (rand() % MAX_SEQ_LEN))); + seqStartPos.push_back(subSeqStartPos.back()); + } else { + seqStartPos.push_back(seqStartPos.back() + + (1 + (rand() % MAX_SEQ_LEN))); + } + } + } + + size_t totalSeqNum = hasSubseq ? 
subSeqStartPos.back() : seqStartPos.back(); + curBeam.candidateScores.resize(totalSeqNum, 0.); + genRand(curBeam.candidateScores.data(), totalSeqNum); +} + +void genSelectedIndices(size_t beamSize, + vector& seqStartPos, + vector& selectedIndices) { + size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1); + selectedIndices.resize(selectedIdsCount, -1.); + + for (size_t i = 0; i < seqStartPos.size() - 1; ++i) { + int seqLen = seqStartPos[i + 1] - seqStartPos[i]; + int n = min(seqLen, static_cast(beamSize)); + vector ids = randSampling(seqLen, n); + memcpy(selectedIndices.data() + i * beamSize, + ids.data(), + sizeof(real) * ids.size()); + } +} + +void genGroundTruth(vector& beamExpansions, + size_t beamSize) { + SingleBeamExpansion& beam = beamExpansions[1]; + size_t seqNum = beam.seqStartPos.size() - 1; + for (size_t i = 2; i < beamExpansions.size(); ++i) + CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1); + + srand(SEED); + + // initialize the first beam. + beam.resetGroundTruth(seqNum); + for (size_t i = 0; i < seqNum; ++i) { + if (randFloat() > 0.5) { + /* + * force the randomly generated label falls in the beam by chance 0.5. + * otherwise, when sequence length is relatively long and beam size is + * relatively small, the gold sequences falls off the beam at in the + * first search. + */ + real* begPos = beam.selectedIndices.data() + i * beamSize; + beam.colIdxInBeam[i] = + rand() % count_if(begPos, begPos + beamSize, [](const real& val) { + return val != -1.; + }); + beam.groundTruth[i] = + beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]]; + beam.inBeam[i] = 1; + } else { + int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]); + beam.groundTruth[i] = label; + + real* begPos = beam.selectedIndices.data() + i * beamSize; + real* endPos = begPos + beamSize; + real* lblPos = find(begPos, endPos, real(label)); + if (lblPos != endPos) { + beam.inBeam[i] = 1; + beam.colIdxInBeam[i] = lblPos - begPos; + } + } + beam.rowIdxInBeam[i] = i; + } + + // iterate over each beam expansions + for (size_t i = 2; i < beamExpansions.size(); ++i) { + SingleBeamExpansion& curBeam = beamExpansions[i]; + SingleBeamExpansion& prevBeam = beamExpansions[i - 1]; + curBeam.resetGroundTruth(seqNum); + + // iterate over each sequence + for (size_t j = 0; j < seqNum; ++j) { + if (!prevBeam.inBeam[j]) continue; + + // gold sequence falls in the beam in previous search. + real* begPos = prevBeam.selectedIndices.data(); + int offset = + prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j]; + curBeam.rowIdxInBeam[j] = count_if( + begPos, begPos + offset, [](const real& val) { return val != -1.; }); + + if (randFloat() > 0.5) { + // force the randomly generated label falls in the beam by chance 0.5. 
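+        // That is, the gold label is chosen from the candidates already
+        // selected into the beam, so it is guaranteed to stay in-beam;
+        // the else branch below instead samples a label uniformly from
+        // the sub-sequence, which may fall outside the beam.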
+ + real* start = + curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize; + int n = rand() % count_if(start, start + beamSize, [](const real& val) { + return val != -1.; + }); + curBeam.colIdxInBeam[j] = n; + curBeam.groundTruth[j] = *(start + n); + curBeam.inBeam[j] = 1; + } else { + CHECK_LE((size_t)curBeam.rowIdxInBeam[j] + 1, + curBeam.subSeqStartPos.size() - 1); + int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]]; + int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1]; + CHECK_GT(size_t(end), size_t(start)); + int label = rand() % (end - start); + + curBeam.groundTruth[j] = label; + real* findBeg = + curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize; + real* lblPos = + find(findBeg, findBeg + beamSize, static_cast(label)); + if (lblPos != (findBeg + beamSize)) { + curBeam.inBeam[j] = 1; + curBeam.colIdxInBeam[j] = lblPos - findBeg; + } + } + } + } +} + +void genOneBeam(size_t beamSize, + bool hasSubseq, + SingleBeamExpansion& prevBeam, + SingleBeamExpansion& curBeam) { + genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam); + genSelectedIndices(beamSize, + hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos, + curBeam.selectedIndices); +} + +void genRandomBeamExpansion(size_t expansionCount, + size_t beamSize, + vector& beamExpansions) { + beamExpansions.clear(); + beamExpansions.resize(expansionCount + 1); + + // beamExpansions[0] is reserved. + for (size_t i = 1; i <= expansionCount; ++i) + genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]); + genGroundTruth(beamExpansions, beamSize); +} + +void testCrossEntropyOverBeam(bool useGpu, + size_t beamSize, + vector& beams) { + TestConfig config; + config.layerConfig.set_type("cross_entropy_over_beam"); + + size_t seqNum = 0; + for (size_t i = 1; i < beams.size(); ++i) { + const SingleBeamExpansion& beam = beams[i]; + // create scores for all the candidates + MatrixPtr candidateScorePtr = + Matrix::create(beam.candidateScores.size(), 1, false, false); + candidateScorePtr->copyFrom(beam.candidateScores.data(), + beam.candidateScores.size()); + + ostringstream paramName; + paramName << "candidate_scores_" << i; + + if (beam.subSeqStartPos.size() > 1) { + seqNum = beam.subSeqStartPos.size() - 1; + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + paramName.str(), + candidateScorePtr, + beam.seqStartPos, + beam.subSeqStartPos}); + } else { + seqNum = beam.seqStartPos.size() - 1; + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + paramName.str(), + candidateScorePtr, + beam.seqStartPos}); + } + config.layerConfig.add_inputs(); + + // create indices for the selected candidates + MatrixPtr selectedCandidates = + Matrix::create(seqNum, beamSize, false, false); + selectedCandidates->copyFrom(beam.selectedIndices.data(), + beam.selectedIndices.size()); + paramName.clear(); + paramName << "selected_candidates_" << i; + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates}); + config.layerConfig.add_inputs(); + + // create the ground truth + paramName.clear(); + paramName << "label_" << i; + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth}); + config.layerConfig.add_inputs(); + } + + testLayerGrad( + config, "cross_entropy_over_beam", seqNum, false, useGpu, false); +} + +TEST(Layer, CrossEntropyOverBeam) { + LOG(INFO) << "SEED = " << SEED; + const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE; + LOG(INFO) << "beamSize = " << beamSize; + + // TODO(caoying): test with random beam 
expansions. + const size_t expansionCount = 3; + vector beams; + genRandomBeamExpansion(expansionCount, beamSize, beams); + + for (bool useGpu : {false, true}) + testCrossEntropyOverBeam(useGpu, beamSize, beams); +} + +int main(int argc, char** argv) { + initMain(argc, argv); + hl_start(); + hl_init(FLAGS_gpu_id); + FLAGS_thread_local_rand_use_global_seed = true; + srand(SEED); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/legacy/gserver/tests/test_DetectionOutput.cpp similarity index 100% rename from paddle/gserver/tests/test_DetectionOutput.cpp rename to paddle/legacy/gserver/tests/test_DetectionOutput.cpp diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/legacy/gserver/tests/test_Evaluator.cpp similarity index 100% rename from paddle/gserver/tests/test_Evaluator.cpp rename to paddle/legacy/gserver/tests/test_Evaluator.cpp diff --git a/paddle/gserver/tests/test_Expand.cpp b/paddle/legacy/gserver/tests/test_Expand.cpp similarity index 100% rename from paddle/gserver/tests/test_Expand.cpp rename to paddle/legacy/gserver/tests/test_Expand.cpp diff --git a/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp b/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a1cfdc705b6c38859f816e062bc68c051bf48f7 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp @@ -0,0 +1,164 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+vector<int> randSampling(int range, int n) {
+  CHECK_GE(range, n);
+  vector<int> num(range);
+  iota(begin(num), end(num), 0);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  return num;
+}
+
+void genRandomSeqInfo(vector<int>& seqStartPosition,
+                      vector<int>& subSeqStartPosition) {
+  const int maxSeqNum = 100;
+  // generate random start position information
+  int seqNum = 1 + (rand() % maxSeqNum);
+  seqStartPosition.resize(seqNum + 1, 0);
+  subSeqStartPosition.resize(1, 0);
+
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqLen = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqLen; ++j)
+      subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen);
+    seqStartPosition[i + 1] = subSeqStartPosition.back();
+  }
+}
+
+void genRandomGroundTruth(real* values,
+                          vector<vector<int>>& groundTruth,
+                          vector<int>& startPos,
+                          size_t beamSize) {
+  groundTruth.resize(startPos.size() - 1, vector<int>(beamSize, -1));
+  for (size_t i = 0; i < startPos.size() - 1; ++i) {
+    int seqLen = startPos[i + 1] - startPos[i];
+    vector<int> pos =
+        randSampling(seqLen, min(static_cast<int>(beamSize), seqLen));
+    for (size_t j = 0; j < pos.size(); ++j) {
+      groundTruth[i][j] = pos[j];
+      values[startPos[i] + pos[j]] = 1.;
+    }
+  }
+}
+
+void checkLayerOut(vector<vector<int>> groundTruth,
+                   real* layerOut,
+                   size_t beamSize) {
+  for (size_t i = 0; i < groundTruth.size(); ++i) {
+    int begPos = i * beamSize;
+    vector<real> tmp(layerOut + begPos, layerOut + begPos + beamSize);
+    sort(begin(tmp), end(tmp));
+    sort(begin(groundTruth[i]), end(groundTruth[i]));
+    for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]);
+  }
+}
+
+TEST(Layer, kmaxSeqScoreLayer) {
+  const size_t maxBeamSize = 100;
+  size_t beamSize = 1 + (rand() % maxBeamSize);
+
+  vector<int> seqStartPosition;
+  vector<int> subSeqStartPosition;
+  genRandomSeqInfo(seqStartPosition, subSeqStartPosition);
+  MatrixPtr inValue =
+      Matrix::create(subSeqStartPosition.back(), 1, false, false);
+
+  std::vector<bool> mode = {false};
+#ifdef PADDLE_WITH_CUDA
+  mode.push_back(true);
+#endif
+
+  for (auto hasSubseq : {false, true}) {
+    vector<vector<int>> groundTruth;
+    inValue->randomizeUniform();
+    genRandomGroundTruth(inValue->getData(),
+                         groundTruth,
+                         hasSubseq ? subSeqStartPosition : seqStartPosition,
+                         beamSize);
+
+    for (auto useGpu : mode) {
+      TestConfig config;
+      config.layerConfig.set_type("kmax_seq_score");
+      config.layerConfig.set_beam_size(beamSize);
+
+      if (hasSubseq) {
+        config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                    "scores",
+                                    inValue,
+                                    seqStartPosition,
+                                    subSeqStartPosition});
+      } else {
+        config.inputDefs.push_back(
+            {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition});
+      }
+      config.layerConfig.add_inputs();
+
+      // data layer initialize
+      std::vector<DataLayerPtr> dataLayers;
+      LayerMap layerMap;
+      vector<Argument> datas;
+      initDataLayer(
+          config,
+          &dataLayers,
+          &datas,
+          &layerMap,
+          "kmax_seq_score",
+          100 /* actually this parameter is unused in self-defined input*/,
+          false,
+          useGpu);
+      // test layer initialize
+      std::vector<ParameterPtr> parameters;
+      LayerPtr kmaxSeqScoreLayer;
+      FLAGS_use_gpu = useGpu;
+      initTestLayer(config, &layerMap, &parameters, &kmaxSeqScoreLayer);
+      kmaxSeqScoreLayer->forward(PASS_TRAIN);
+
+      const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue();
+      CHECK_EQ(outValue->getHeight(),
+               hasSubseq ? subSeqStartPosition.size() - 1
+                         : seqStartPosition.size() - 1);
+      CHECK_EQ(outValue->getWidth(), beamSize);
+      checkLayerOut(groundTruth, outValue->getData(), beamSize);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand((size_t)(time(NULL)));
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_LayerGrad.cpp b/paddle/legacy/gserver/tests/test_LayerGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..979cf8ee673291d66f8704f2deda6c7160f4b228
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_LayerGrad.cpp
@@ -0,0 +1,2532 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifdef PADDLE_WITH_CUDA
+#include <cudnn.h>
+#endif
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
+
+TEST(Operator, dot_mul) {
+  TestConfig config;
+  config.layerConfig.set_size(10);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
+  operatorConf.set_type("dot_mul");
+  operatorConf.set_dotmul_scale(-1);
+
+  testOperatorGrad(config, operatorConf, 100, false, false);
+}
+
+TEST(Projection, context) {
+  for (auto contextStart : {-5, -3, -1, 0, 3}) {
+    for (auto contextLength : {1, 2, 5, 7}) {
+      for (auto batchSize : {1, 2, 5, 20}) {
+        for (auto trainablePadding : {false, true}) {
+          LOG(INFO) << " contextStart=" << contextStart
+                    << " contextLength=" << contextLength
+                    << " batchSize=" << batchSize
+                    << " trainablePadding=" << trainablePadding;
+          ProjectionConfig conf;
+          conf.set_type("context");
+          conf.set_input_size(10);
+          conf.set_context_start(contextStart);
+          conf.set_context_length(contextLength);
+          conf.set_trainable_padding(trainablePadding);
+          conf.set_output_size(conf.context_length() * conf.input_size());
+          int pad =
+              std::max(0, -conf.context_start()) +
+              std::max(0, conf.context_start() + conf.context_length() - 1);
+          for (auto useGpu : {false, true}) {
+            testProjectionGrad(
+                conf,
+                INPUT_SEQUENCE_DATA,
+                trainablePadding ?
conf.input_size() * pad : 0, + batchSize, + useGpu, + contextStart + contextLength <= 1); // = testState + } + } + } + } + } +} + +TEST(Projection, trans_fc) { + ProjectionConfig conf; + conf.set_type("trans_fc"); + conf.set_input_size(50); + conf.set_output_size(20); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 1000, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, fc) { + ProjectionConfig conf; + conf.set_type("fc"); + conf.set_input_size(10); + conf.set_output_size(20); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 200, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, dot_mul) { + ProjectionConfig conf; + conf.set_type("dot_mul"); + conf.set_input_size(20); + conf.set_output_size(20); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 20, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, table) { + ProjectionConfig conf; + conf.set_type("table"); + conf.set_input_size(10); + conf.set_output_size(20); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_LABEL, + /* parameterSize */ 200, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, identity) { + ProjectionConfig conf; + conf.set_type("identity"); + conf.set_input_size(10); + conf.set_output_size(10); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 0, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, slice) { + ProjectionConfig conf; + conf.set_type("slice"); + conf.set_input_size(100); + SliceConfig& slice1 = *conf.add_slices(); + slice1.set_start(10); + slice1.set_end(20); + SliceConfig& slice2 = *conf.add_slices(); + slice2.set_start(50); + slice2.set_end(70); + conf.set_output_size(30); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 0, + /* batchSize */ 10, + useGpu); + } +} + +TEST(Projection, scaling) { + ProjectionConfig conf; + conf.set_type("scaling"); + conf.set_input_size(10); + conf.set_output_size(10); + for (auto useGpu : {false}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 1, + /* batchSize */ 100, + useGpu); + } +} + +void testProjectionConv(size_t groups, bool isDeconv) { + const int NUM_FILTERS = 18; + const int FILTER_SIZE = 2; + const int FILTER_SIZE_Y = 2; + const int CHANNELS = 3; + const int IMAGE_SIZE = 16; + +#if CUDNN_VERSION >= 6000 + const int DILATION = 2; +#else + const int DILATION = 1; +#endif + + ProjectionConfig conf; + if (isDeconv) { + conf.set_type("convt"); + } else { + conf.set_type("conv"); + } + conf.set_num_filters(NUM_FILTERS); + + ConvConfig* conv = conf.mutable_conv_conf(); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_channels(CHANNELS); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_dilation(DILATION); + conv->set_dilation_y(DILATION); + conv->set_groups(groups); + if (isDeconv) { + conv->set_filter_channels(NUM_FILTERS / conv->groups()); + } else { + conv->set_filter_channels(conv->channels() / conv->groups()); + } + conv->set_img_size(IMAGE_SIZE); + int output_x = outputSize(conv->img_size(), + (conv->filter_size() - 1) * DILATION + 1, + conv->padding(), + conv->stride(), + /* caffeMode */ true); + int output_y = outputSize(conv->img_size(), + (conv->filter_size_y() - 1) * DILATION + 1, + conv->padding_y(), + 
conv->stride_y(), + /* caffeMode */ true); + conv->set_output_x(output_x); + conv->set_output_y(output_y); + LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x + << "; output_y: " << output_y; + if (isDeconv) { + int deconv_image_x = imageSize(output_x, + (conv->filter_size() - 1) * DILATION + 1, + conv->padding(), + conv->stride(), + /* caffeMode */ true); + int deconv_image_y = imageSize(output_y, + (conv->filter_size_y() - 1) * DILATION + 1, + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true); + + LOG(INFO) << " deconv_image_x: " << deconv_image_x + << "; deconv_image_y: " << deconv_image_y; + conf.set_input_size(output_x * output_y * CHANNELS); + conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS); + } else { + conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); + conf.set_output_size(output_x * output_y * NUM_FILTERS); + } + + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * + FILTER_SIZE_Y / groups, + /* batchSize */ 100, + true, + false, + NUM_FILTERS, + true); +} + +#ifdef PADDLE_WITH_CUDA +TEST(Projection, conv) { + /// test ConvProjection + testProjectionConv(1, false); + testProjectionConv(3, false); + /// test ConvTransProjection + testProjectionConv(1, true); + testProjectionConv(3, true); +} +#endif + +TEST(Layer, BilinearInterpLayer) { + TestConfig config; + config.layerConfig.set_type("bilinear_interp"); + config.biasSize = 0; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); + + LayerInputConfig* input = config.layerConfig.add_inputs(); + BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf(); + ImageConfig* image = bilinear->mutable_image_conf(); + image->set_img_size(32); + image->set_img_size_y(32); + image->set_channels(4); + + for (auto useGpu : {false, true}) { + for (auto outSize : {32, 64}) { + bilinear->set_out_size_x(outSize); + bilinear->set_out_size_y(outSize); + testLayerGrad(config, "bilinear_interp", 10, false, useGpu); + } + } +} + +TEST(Layer, concat) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("concat"); + config.layerConfig.set_size(15); + config.layerConfig.set_active_type("sigmoid"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "concat", 100, false, useGpu); + } +} + +TEST(Layer, AddtoLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("addto"); + config.layerConfig.set_size(10); + config.layerConfig.set_active_type("sigmoid"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "addto", 100, false, useGpu); + } +} + +TEST(Layer, CTCLayer) { + TestConfig config; + config.layerConfig.set_type("ctc"); + config.layerConfig.set_norm_by_times(false); + config.layerConfig.set_size(10); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "ctc", + 100, + /* trans */ false, /* useGpu */ + useGpu); + } +} + 
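+// For reference, the convolution geometry used by the projection tests above
+// follows the usual caffeMode formulas from paddle/legacy/math/MathUtils.h:
+//
+//   int effectiveFilter = (filterSize - 1) * dilation + 1;
+//   int outputX = (imgSize + 2 * padding - effectiveFilter) / stride + 1;
+//
+// e.g. in testProjectionConv(): imgSize=16, filterSize=2, dilation=2 gives an
+// effective filter of 3, so output_x = (16 - 3) / 2 + 1 = 7 and, with
+// padding_y=1, output_y = (16 + 2 - 3) / 2 + 1 = 8; imageSize() inverts the
+// same mapping for the deconvolution case.
+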
+TEST(Layer, cosSimLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("cos");
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "cos", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, CosSimVecMatLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("cos_vm");
+  config.layerConfig.set_size(5);  // output size
+  config.layerConfig.set_cos_scale(2.0);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "cos_vm", 100, false, useGpu);
+  }
+}
+
+void testDepthwiseConvLayer(const string& type, bool useGpu) {
+  TestConfig config;
+  config.biasSize = 32;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(32);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(3);
+  conv->set_channels(16);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(16);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(16);
+  conv->set_img_size_y(8);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /* caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /* caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              config.layerConfig.num_filters());
+
+  testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
+}
+
+TEST(Layer, depthwiseConvLayer) {
+  // 'depthwise_conv' is a special case of 'exconv' whose group count
+  // equals the number of input channels.
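+  // With groups == channels, each filter sees exactly one input channel, so
+  // the weight above shrinks to num_filters * (channels / groups) *
+  // filter_size * filter_size_y = 32 * 1 * 2 * 3 = 192 parameters, which is
+  // the paraSize declared for "layer_0" in testDepthwiseConvLayer().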
+ testDepthwiseConvLayer("exconv", /* useGpu= */ false); +#ifdef PADDLE_WITH_CUDA + testDepthwiseConvLayer("exconv", /* useGpu= */ true); +#endif +} + +void testConvLayer(const string& type, bool trans, bool useGpu) { + TestConfig config; + config.biasSize = 16; + config.layerConfig.set_type(type); + config.layerConfig.set_num_filters(16); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + int dilation = 2; + if (type == "cudnn_conv") { +#if CUDNN_VERSION >= 6000 + dilation = 2; +#else + dilation = 1; +#endif + } + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(2); + conv->set_channels(3); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_dilation(dilation); + conv->set_dilation_y(dilation); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(16); + conv->set_img_size_y(16); + conv->set_output_x(outputSize(conv->img_size(), + (conv->filter_size() - 1) * dilation + 1, + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + conv->set_output_y(outputSize(conv->img_size_y(), + (conv->filter_size_y() - 1) * dilation + 1, + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + config.layerConfig.num_filters()); + + testLayerGrad(config, "conv", 100, trans, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02); +} + +TEST(Layer, convLayer) { + testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false); +#ifdef PADDLE_WITH_CUDA + testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true); + testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true); +#endif +} + +void testConvTransLayer(const string& type, bool trans, bool useGpu) { + TestConfig config; + config.biasSize = 3; + config.layerConfig.set_type(type); + config.layerConfig.set_num_filters(3); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(4); + conv->set_channels(16); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(3 / conv->groups()); + conv->set_img_size(16); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + + config.layerConfig.set_size(conv->img_size() * conv->img_size() * + config.layerConfig.num_filters()); + + testLayerGrad(config, "convTrans", 100, trans, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02); +} + +TEST(Layer, convTransLayer) { + for (auto useGpu : {false, true}) { + testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu); + } +#ifdef PADDLE_WITH_CUDA + testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true); +#endif +} + +TEST(Layer, blockExpandLayer) { + TestConfig config; + config.biasSize = 0; + 
config.layerConfig.set_type("blockexpand"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + BlockExpandConfig* blockExpand = input->mutable_block_expand_conf(); + blockExpand->set_img_size_x(64); + blockExpand->set_img_size_y(32); + blockExpand->set_channels(3); + blockExpand->set_padding_x(0); + blockExpand->set_padding_y(0); + blockExpand->set_block_x(4); + blockExpand->set_block_y(32); + blockExpand->set_stride_x(2); + blockExpand->set_stride_y(2); + blockExpand->set_output_x(outputSize(blockExpand->img_size_x(), + blockExpand->block_x(), + blockExpand->padding_x(), + blockExpand->stride_x(), + /* caffeMode */ false)); + blockExpand->set_output_y(outputSize(blockExpand->img_size_y(), + blockExpand->block_y(), + blockExpand->padding_y(), + blockExpand->stride_y(), + /* caffeMode */ false)); + config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() * + blockExpand->channels()); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "blockexpand", 100, false, useGpu); + } +} + +TEST(Layer, maxoutLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("maxout"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + MaxOutConfig* maxout = input->mutable_maxout_conf(); + ImageConfig* image = maxout->mutable_image_conf(); + + image->set_img_size(32); + image->set_img_size_y(32); + image->set_channels(4); + maxout->set_groups(2); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "maxout", 10, false, useGpu); + } +} + +void testFcLayer(string format, size_t nnz) { + TestConfig config; + config.biasSize = 1024; + config.layerConfig.set_type("fc"); + config.layerConfig.set_size(1024); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_drop_rate(0.1); + + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)}); + config.layerConfig.add_inputs(); + + LOG(INFO) << config.inputDefs[0].sparse.sparse << " " + << config.inputDefs[0].sparse.format; + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "fc", + 100, + /* trans */ false, + useGpu, + /* weight */ true); + } +} + +TEST(Layer, fcLayer) { + testFcLayer("", 1024 * 1024 * 2); + testFcLayer("csc", 1024 * 10); + testFcLayer("csr", 1024 * 10); +} + +TEST(Layer, SelectiveFullyConnectedLayer) { + TestConfig config; + size_t nin = 16; + size_t nout = 256; + config.layerConfig.set_type("selective_fc"); + config.layerConfig.set_size(nout); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_has_selected_colums(true); + config.layerConfig.set_selective_fc_pass_generation(false); + config.biasSize = nout; + + config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back( + {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)}); + config.layerConfig.add_inputs(); + + testLayerGrad(config, + "selective_fc", + 100, + /* trans= */ false, + /* useGup= */ false, + false); +#ifdef PADDLE_WITH_CUDA + testLayerGrad(config, + "selective_fc", + 100, + /* trans= */ false, + /* useGup= */ true, + false); +#endif +} + +TEST(Layer, DataNormLayer) { + TestConfig config; + config.layerConfig.set_type("data_norm"); + config.layerConfig.set_size(20); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100}); + config.inputDefs.back().isStatic = true; 
+  config.layerConfig.add_inputs();
+
+  for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) {
+    config.layerConfig.set_data_norm_strategy(strategy);
+    // The parameters are static, so GPU is not supported yet
+    testLayerGrad(config,
+                  "data_norm",
+                  200,
+                  /* trans */ false,
+                  /* useGpu */ false);
+  }
+}
+
+TEST(Layer, hsigmoidLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("hsigmoid");
+  config.layerConfig.set_num_classes(5);
+  config.layerConfig.set_size(1);
+  config.biasSize = config.layerConfig.num_classes() - 1;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200});
+  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "hsigmoid",
+                  100,
+                  /* trans */ false,
+                  /* useGpu */ useGpu);
+  }
+}
+
+TEST(Layer, multi_cross) {
+  TestConfig config;
+  config.layerConfig.set_type("multi-class-cross-entropy");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(
+        config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu);
+  }
+}
+
+TEST(Layer, multi_binary_label_sparse_mat) {
+  TestConfig config;
+  config.layerConfig.set_type("multi_binary_label_cross_entropy");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "multi_binary_label_cross_entropy",
+                  100,
+                  /* trans */ false,
+                  useGpu);
+  }
+}
+
+TEST(Layer, multi_binary_label_id) {
+  TestConfig config;
+  config.layerConfig.set_type("multi_binary_label_cross_entropy");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "multi_binary_label_cross_entropy",
+                  100,
+                  /* trans */ false,
+                  useGpu);
+  }
+}
+
+TEST(Layer, multi_cross_with_selfnorm) {
+  TestConfig config;
+  config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm");
+  config.layerConfig.set_softmax_selfnorm_alpha(0.1);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  // GPU is not supported yet
+  testLayerGrad(config,
+                "multi_class_cross_entropy_with_selfnorm",
+                100,
+                /* trans */ false,
+                /* useGpu */ false);
+}
+
+TEST(Layer, multi_cross_soft) {
+  TestConfig config;
+  config.layerConfig.set_type("soft_binary_class_cross_entropy");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "soft_binary_class_cross_entropy",
+                  100,
+                  /* trans */ false,
+                  useGpu);
+  }
+}
+
+TEST(Layer, square_error) {
+  TestConfig config;
+  config.layerConfig.set_type("square_error");
+  config.biasSize = 0;
+
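+  // square_error is the plain sum-of-squares cost over the 10-dimensional
+  // (prediction, target) pairs below; up to the conventional 1/2 factor its
+  // gradient w.r.t. the prediction is (pred - target), which is what the
+  // numerical gradient check exercises.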
config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu); + } +} + +TEST(Layer, sparse_square_error) { + TestConfig config; + config.layerConfig.set_type("square_error"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); + config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + // "GpuSparseMatrix" as label is not supported + testLayerGrad(config, + "square_error", + 100, + /* trans */ false, + /* useGpu */ false); +} + +TEST(Layer, sparse_float_square_error) { + TestConfig config; + config.layerConfig.set_type("square_error"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); + config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + // "GpuSparseMatrix" as label is not supported + testLayerGrad(config, + "square_error", + 100, + /* trans */ false, + /* useGpu */ false); +} + +TEST(Layer, square_error_weighted) { + TestConfig config; + config.layerConfig.set_type("square_error"); + config.biasSize = 0; + config.testAccumulate = false; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu); + } +} + +TEST(Layer, huber_regression_loss) { + TestConfig config; + config.layerConfig.set_type("huber_regression"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + for (auto delta : {1, 3, 5}) { + config.layerConfig.set_delta(delta); + testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu); + } + } +} + +TEST(Layer, huber_two_class) { + TestConfig config; + config.layerConfig.set_type("huber_classification"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu); + } +} + +void testExpandLayer(string trans_type, bool hasSubseq) { + TestConfig config; + config.layerConfig.set_type("expand"); + + config.inputDefs.push_back( + {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA, + "layer_0", + 10, + 0}); + config.inputDefs.push_back( + {hasSubseq ? 
INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+       "layer_1",
+       10,
+       0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.set_trans_type(trans_type);
+  LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq;
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "expand", 30, false, useGpu);
+  }
+}
+
+TEST(Layer, ExpandLayer) {
+  testExpandLayer("non-seq", false);  // non-seq expand to seq
+  testExpandLayer("non-seq", true);   // non-seq expand to hasSubseq
+  testExpandLayer("seq", true);       // seq expand to hasSubseq
+}
+
+void testDegradeLayer(bool hasSubseq,
+                      string layer_type,
+                      string trans_type,
+                      int stride) {
+  TestConfig config;
+  config.layerConfig.set_type(layer_type);
+  config.layerConfig.set_size(10);
+  config.layerConfig.set_seq_pool_stride(stride);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back(
+      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+       "layer_0",
+       10,
+       0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.set_trans_type(trans_type);
+
+  auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) {
+    for (auto useGpu : {false, true}) {
+      testLayerGrad(config, layer_type, 100, false, useGpu);
+    }
+  };
+
+  if (layer_type == "average") {
+    for (auto strategy : {"average", "sum", "squarerootn"}) {
+      LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
+                << " average_strategy=" << strategy
+                << " seq_pool_stride=" << stride;
+      config.layerConfig.set_average_strategy(strategy);
+      testDegradeLayerGrad(config, layer_type);
+    }
+  } else {
+    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
+              << " seq_pool_stride=" << stride;
+    testDegradeLayerGrad(config, layer_type);
+  }
+}
+
+TEST(Layer, MaxLayer) {
+  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
+  testDegradeLayer(false,
+                   "max",
+                   "non-seq",
+                   5);  // seq max to a shortened seq, stride window = 5
+  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
+}
+
+TEST(Layer, SequenceLastInstanceLayer) {
+  testDegradeLayer(false,
+                   "seqlastins",
+                   "non-seq",
+                   -1);  // seq seqlastins to non-seq
+  testDegradeLayer(false,
+                   "seqlastins",
+                   "non-seq",
+                   5);  // seq seqlastins to a shortened seq, stride window = 5
+  testDegradeLayer(true,
+                   "seqlastins",
+                   "non-seq",
+                   -1);  // hasSubseq seqlastins to non-seq
+  testDegradeLayer(true,
+                   "seqlastins",
+                   "seq",
+                   -1);  // hasSubseq seqlastins to seq
+}
+
+TEST(Layer, AverageLayer) {
+  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
+  testDegradeLayer(false,
+                   "average",
+                   "non-seq",
+                   5);  // seq average to a shortened seq, stride window = 5
+  testDegradeLayer(true,
+                   "average",
+                   "non-seq",
+                   -1);  // hasSubseq average to non-seq
+  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
+}
+
+TEST(Layer, SequenceConcatLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("seqconcat");
+  config.layerConfig.set_size(10);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "seqconcat", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, SequenceReshapeLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("seqreshape");
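+  // seqreshape keeps the underlying data but re-chunks each sequence: rows of
+  // width 100 are reinterpreted as rows of the new layer width 10, so every
+  // input timestep becomes ten output timesteps.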
+  config.layerConfig.set_size(10);
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "seqreshape", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, ConvShiftLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("conv_shift");
+  config.layerConfig.set_size(10);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  // GPU is not supported yet
+  testLayerGrad(config, "conv_shift", 100, false, false);
+}
+
+TEST(Layer, PowerLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("power");
+  config.layerConfig.set_size(10);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "power", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, ConvexCombinationLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("convex_comb");
+  config.layerConfig.set_size(20);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "convex_comb", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, InterpolationLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("interpolation");
+  config.layerConfig.set_size(10);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "interpolation", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, DotProdLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("dot_prod");
+  config.layerConfig.set_size(1);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "dot_prod", 10, false, useGpu);
+  }
+}
+
+TEST(Layer, OuterProdLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("out_prod");
+  config.layerConfig.set_size(100);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "out_prod", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, SlopeInterceptLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("slope_intercept");
+  config.layerConfig.set_size(10);
+  config.layerConfig.set_slope(1.0);
+  config.layerConfig.set_intercept(0.1);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "slope_intercept", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, ScalingLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("scaling");
+  config.layerConfig.set_size(10);
+
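+  // scaling multiplies every row of the second input by the per-sample
+  // scalar from the first input, i.e. out = w * x with w of dimension 1 and
+  // x matching the layer size of 10.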
config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "scaling", 100, false, useGpu); + } +} + +void testNormLayer(const string& normType, bool trans, bool useGpu) { + TestConfig config; + config.layerConfig.set_type("norm"); + config.layerConfig.set_active_type("relu"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type(normType); + norm->set_channels(16); + norm->set_size(5); + norm->set_scale(0.001); + norm->set_pow(0.75); + norm->set_blocked(0); + norm->set_img_size(14); + norm->set_img_size_y(7); + norm->set_output_x(norm->img_size()); + norm->set_output_y(norm->img_size_y()); + if (norm->norm_type() == "cmrnorm" || + norm->norm_type() == "cmrnorm-projection") { + norm->set_scale(norm->scale() / norm->size()); + } else { + norm->set_scale(norm->scale() / (norm->size() * norm->size())); + } + + config.layerConfig.set_size(norm->output_x() * norm->output_y() * + norm->channels()); + config.biasSize = 0; + + testLayerGrad(config, "norm", 100, trans, useGpu); +} + +TEST(Layer, NormLayer) { + testNormLayer("cmrnorm-projection", + /* trans= */ false, /* useGpu= */ + true); + testNormLayer("cmrnorm-projection", + /* trans= */ false, /* useGpu= */ + false); +} + +void setPoolConfig(TestConfig* config, + PoolConfig* pool, + const string& poolType) { + (*config).biasSize = 0; + (*config).layerConfig.set_type("pool"); + (*config).layerConfig.set_num_filters(16); + + int kw = 3, kh = 3; + int pw = 0, ph = 0; + int sw = 2, sh = 2; + pool->set_pool_type(poolType); + pool->set_channels(16); + pool->set_size_x(kw); + pool->set_size_y(kh); + pool->set_start(0); + pool->set_padding(pw); + pool->set_padding_y(ph); + pool->set_stride(sw); + pool->set_stride_y(sh); + + int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); + int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); + pool->set_output_x(ow); + pool->set_output_y(oh); +} + +void testPoolLayer(const string& poolType, + bool trans, + bool useGpu, + bool excludeMode = true) { + TestConfig config; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + PoolConfig* pool = input->mutable_pool_conf(); + + pool->set_img_size(14); + pool->set_img_size_y(14); + pool->set_exclude_mode(excludeMode); + setPoolConfig(&config, pool, poolType); + config.layerConfig.set_size(pool->output_x() * pool->output_y() * + pool->channels()); + + testLayerGrad(config, "pool", 100, trans, useGpu); +} + +#ifdef PADDLE_WITH_CUDA +void testPoolLayer2(const string& poolType, bool trans, bool useGpu) { + TestConfig config; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + PoolConfig* pool = input->mutable_pool_conf(); + + pool->set_size_y(4); + pool->set_stride_y(3); + pool->set_img_size(10); + pool->set_img_size_y(20); + setPoolConfig(&config, pool, poolType); + pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) / + ((float)pool->stride_y()) + + 1.5); + config.layerConfig.set_size(pool->output_x() * pool->output_y() * + pool->channels()); + + testLayerGrad(config, "pool", 100, trans, useGpu); +} +#endif 
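+// For reference, outputSize() with caffeMode=false rounds up:
+//
+//   out = (imgSize - kernel + 2 * pad + stride - 1) / stride + 1,
+//
+// so the 14 x 14 x 16 input of testPoolLayer() (3136 values) with a 3 x 3
+// window, stride 2 and no padding pools down to 7 x 7 x 16 = 784 outputs.
+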
+
+TEST(Layer, PoolLayer) {
+  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ false,
+                /* excludeMode= */ false);
+  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
+
+#ifdef PADDLE_WITH_CUDA
+  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ true,
+                /* excludeMode= */ false);
+  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2("cudnn-avg-incl-pad-pool",
+                 /* trans= */ false,
+                 /* useGpu= */ true);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void setPool3DConfig(TestConfig* config,
+                     PoolConfig* pool,
+                     const string& poolType) {
+  // filter size
+  const int NUM_FILTERS = 16;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+  const int CHANNELS = 16;
+
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool3d");
+  (*config).layerConfig.set_num_filters(NUM_FILTERS);
+
+  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
+  int pw = 0, ph = 0, pd = 0;
+  int sw = 2, sh = 2, sd = 2;
+
+  // honor the pool type requested by the caller instead of overriding it
+  pool->set_pool_type(poolType);
+  pool->set_channels(CHANNELS);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_size_z(kd);
+  pool->set_padding(0);
+  pool->set_padding_y(0);
+  pool->set_padding_z(0);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+  pool->set_stride_z(sd);
+  pool->set_start(0);
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+  pool->set_output_z(od);
+}
+
+void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  pool->set_img_size(IMAGE_SIZE);
+  pool->set_img_size_y(IMAGE_SIZE_Y);
+  pool->set_img_size_z(IMAGE_SIZE_Z);
+
+  setPool3DConfig(&config, pool, poolType);
+  // the 3D output volume must include output_z as well
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->output_z() * pool->channels());
+
+  testLayerGrad(config, "pool3d", 100, trans, useGpu);
+}
+
+TEST(Layer, Pool3DLayer) {
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
+#ifdef PADDLE_WITH_CUDA
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void testSppLayer(const string& poolType,
+                  const int pyramidHeight,
+                  bool trans,
+                  bool useGpu) {
+  TestConfig config;
+  config.layerConfig.set_type("spp");
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
+  LayerInputConfig* input =
config.layerConfig.add_inputs(); + SppConfig* sppConfig = input->mutable_spp_conf(); + sppConfig->set_pool_type(poolType); + sppConfig->set_pyramid_height(pyramidHeight); + ImageConfig* imageConfig = sppConfig->mutable_image_conf(); + imageConfig->set_channels(16); + imageConfig->set_img_size(10); + imageConfig->set_img_size_y(20); + int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1); + config.layerConfig.set_size(outputSize * imageConfig->channels()); + testLayerGrad(config, "spp", 100, trans, useGpu); +} + +TEST(Layer, SpatialPyramidPoolLayer) { + for (auto useGpu : {false, true}) { + for (auto pyramidHeight : {1, 2, 3}) { + testSppLayer("avg-projection", pyramidHeight, false, useGpu); + testSppLayer("max-projection", pyramidHeight, false, useGpu); + } + } +} + +TEST(Layer, rankCostLayer) { + TestConfig config; + config.layerConfig.set_type("rank-cost"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "rank-cost", 100, false, useGpu); + } +} + +TEST(Layer, sumCostLayer) { + TestConfig config; + config.layerConfig.set_type("sum_cost"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "sum_cost", 100, false, useGpu); + } +} + +TEST(Layer, weightedRankCostLayer) { + TestConfig config; + config.layerConfig.set_type("rank-cost"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu); + } +} + +TEST(Layer, TensorLayer) { + TestConfig config; + config.layerConfig.set_type("tensor"); + config.layerConfig.set_size(10); + config.layerConfig.set_active_type("sigmoid"); + config.biasSize = config.layerConfig.size(); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "tensor", 100, false, useGpu); + } +} + +TEST(Layer, RecurrentLayer) { + TestConfig config; + config.layerConfig.set_type("recurrent"); + config.layerConfig.set_size(4); + config.layerConfig.set_active_type("tanh"); + config.biasSize = 4; + + config.inputDefs.push_back( + {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + for (auto reversed : {false, true}) { + config.layerConfig.set_reversed(reversed); + config.testState = !reversed; + testLayerGrad( + config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0); + } + } +} + +TEST(Layer, LstmLayer) { + TestConfig config; + config.layerConfig.set_type("lstmemory"); + config.layerConfig.set_size(4); + 
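+  // lstmemory parameter layout for size 4: the 16-wide input below carries
+  // the four gate pre-activations (4 * size), the 64-element weight is the
+  // size x 4*size recurrent matrix, and biasSize 28 is 7 * size (four gate
+  // biases plus the three peephole vectors kept in the bias parameter in
+  // this implementation).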
config.layerConfig.set_active_type("tanh"); + config.layerConfig.set_active_state_type("sigmoid"); + config.layerConfig.set_active_gate_type("sigmoid"); + config.biasSize = 28; + + config.inputDefs.push_back( + {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + for (auto reversed : {false, true}) { + config.layerConfig.set_reversed(reversed); + config.testState = !reversed; + testLayerGrad( + config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02); + } + } + for (auto useGpu : {true}) { + config.testBatchState = true; + config.layerConfig.set_reversed(false); + testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu); + } +} + +TEST(Layer, MDLstmLayer) { + TestConfig config; + config.layerConfig.set_type("mdlstmemory"); + config.layerConfig.set_size(4); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_active_state_type("sigmoid"); + config.layerConfig.set_active_gate_type("sigmoid"); + config.biasSize = 4 * 9; + + config.inputDefs.push_back( + {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5}); + config.layerConfig.add_inputs(); + config.layerConfig.add_directions(true); + config.layerConfig.add_directions(true); + + for (auto useGpu : {false, true}) { + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + config.layerConfig.set_directions(0, bool(i)); + config.layerConfig.set_directions(1, bool(j)); + testLayerGrad(config, "mdlstmemory", 100, false, useGpu); + } + } + } +} + +TEST(Layer, ParameterReluLayer) { + auto testParameterReluLayer = [&](size_t inputSize, size_t channels) { + TestConfig config; + config.layerConfig.set_type("prelu"); + config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels}); + config.layerConfig.add_inputs(); + config.layerConfig.set_size(inputSize); + config.layerConfig.set_partial_sum(inputSize / + channels); // size of feature map + for (auto useGpu : {false, true}) { + testLayerGrad(config, "prelu", 100, false, useGpu); + } + }; + + testParameterReluLayer(192, 1); + testParameterReluLayer(192, 3); + testParameterReluLayer(192, 192); +} + +TEST(Layer, ResizeLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("resize"); + config.layerConfig.set_size(64); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "resize", 100, false, useGpu); + } +} + +TEST(Layer, RotateLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("rotate"); + const int CHANNEL = 2; + const int HEIGHT = 8; + const int WIDTH = 4; + const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL; + config.layerConfig.set_size(INPUT_SIZE); + config.layerConfig.set_height(HEIGHT); + config.layerConfig.set_width(WIDTH); + config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "rotate", 100, false, useGpu); + } +} + +TEST(Layer, NCELayer) { + TestConfig config; + size_t numClasses = 4; + config.layerConfig.set_type("nce"); + config.layerConfig.set_size(1); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_num_classes(numClasses); + config.biasSize = numClasses; + + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 * numClasses}); + config.inputDefs.push_back( + {INPUT_LABEL, "label", /* dim= */ 
numClasses, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto withWeight : {false, true}) { + if (withWeight) { + config.inputDefs.push_back( + {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + } + + for (auto isIdLabel : {false, true}) { + config.inputDefs[1] = { + isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA, + "label", + /* dim= */ numClasses, + /* paraSize= */ 0}; + + for (auto withDist : {false, true}) { + config.layerConfig.clear_neg_sampling_dist(); + if (withDist) { + double sum = 0; + for (size_t i = 0; i < numClasses; ++i) { + real p = rand(); // NOLINT use rand_r + config.layerConfig.add_neg_sampling_dist(p); + sum += p; + } + for (size_t i = 0; i < numClasses; ++i) { + real p = config.layerConfig.neg_sampling_dist(i) / sum; + config.layerConfig.set_neg_sampling_dist(i, p); + } + } + LOG(INFO) << "NCELayer " + << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight + << " withDist=" << withDist; + // Not support GPU now + testLayerGrad(config, + "nce", + 100, + /* trans= */ false, + /* useGpu */ false); + } + } + } +} + +TEST(Layer, GatedRecurrentLayer) { + TestConfig config; + config.layerConfig.set_type("gated_recurrent"); + config.layerConfig.set_size(4); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_active_gate_type("sigmoid"); + config.biasSize = 12; + + config.inputDefs.push_back( + {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + for (auto reversed : {false, true}) { + config.layerConfig.set_reversed(reversed); + config.testState = !reversed; + testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false, useGpu); + } + } +} + +TEST(Layer, GruStepLayer) { + TestConfig config; + config.layerConfig.set_type("gru_step"); + config.layerConfig.set_size(4); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_active_gate_type("sigmoid"); + config.biasSize = 12; + + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48}); + config.inputDefs.push_back( + {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu); + } +} + +TEST(Layer, LstmStepLayer) { + TestConfig config; + config.layerConfig.set_type("lstm_step"); + config.layerConfig.set_size(4); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_active_state_type("sigmoid"); + config.layerConfig.set_active_gate_type("sigmoid"); + config.biasSize = 12; + config.testAccumulate = false; + + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0}); + config.inputDefs.push_back( + {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu); + } +} + +void testBatchNormLayer(const string& type, bool trans, bool useGpu) { + TestConfig config; + const int CHANNELS = 10; + const int IMG_SIZE = 16; + const int IMG_SIZE_Y = 8; + size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y; + config.layerConfig.set_type(type); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sigmoid"); + config.biasSize = CHANNELS; + 
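+  // batch_norm takes three inputs: the data (whose CHANNELS-wide weight is
+  // the scale gamma), plus two static inputs holding the running mean and
+  // running variance, one value per channel; the CHANNELS-wide bias is beta.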
config.inputDefs.push_back({INPUT_DATA, + "layer_0", + /* dim= */ size, + /* paraSize= */ CHANNELS}); + + config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); + config.inputDefs.back().isStatic = true; + config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS}); + config.inputDefs.back().isStatic = true; + + LayerInputConfig* input = config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(CHANNELS); + img_conf->set_img_size(IMG_SIZE); + img_conf->set_img_size_y(IMG_SIZE_Y); + + testLayerGrad(config, + "batch_norm", + 64, + /* trans= */ trans, + useGpu, + /* useWeight */ true); +} + +TEST(Layer, BatchNormalizationLayer) { + testBatchNormLayer("batch_norm", false, false); +#ifdef PADDLE_WITH_CUDA + testBatchNormLayer("batch_norm", false, true); + if (hl_get_cudnn_lib_version() >= int(4000)) { + testBatchNormLayer("cudnn_batch_norm", false, true); + } +#endif +} + +void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) { + TestConfig config; + const int CHANNELS = 10; + const int IMG_SIZE = 16; + const int IMG_SIZE_Y = 8; + const int IMG_SIZE_Z = 8; + size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y * IMG_SIZE_Z; + config.layerConfig.set_type(type); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sigmoid"); + config.biasSize = CHANNELS; + config.inputDefs.push_back({INPUT_DATA, + "layer_0", + /* dim= */ size, + /* paraSize= */ CHANNELS}); + + config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); + config.inputDefs.back().isStatic = true; + config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS}); + config.inputDefs.back().isStatic = true; + + LayerInputConfig* input = config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(CHANNELS); + img_conf->set_img_size(IMG_SIZE); + img_conf->set_img_size_y(IMG_SIZE_Y); + img_conf->set_img_size_z(IMG_SIZE_Z); + + testLayerGrad(config, + "batch_norm", + 64, + /* trans= */ trans, + useGpu, + /* useWeight */ true); +} + +TEST(Layer, testBatchNorm3DLayer) { + testBatchNorm3DLayer("batch_norm", false, false); +#ifdef PADDLE_WITH_CUDA + testBatchNorm3DLayer("batch_norm", false, true); + if (hl_get_cudnn_lib_version() >= int(4000)) { + testBatchNorm3DLayer("cudnn_batch_norm", false, true); + } +#endif +} + +void testConvOperator(bool isDeconv) { + TestConfig config; + const int NUM_FILTERS = 16; + const int FILTER_SIZE = 2; + const int FILTER_SIZE_Y = 3; + const int CHANNELS = 3; + const int IMAGE_SIZE = 16; + const int IMAGE_SIZE_Y = 9; + OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); + if (isDeconv) { + operatorConf.set_type("convt"); + } else { + operatorConf.set_type("conv"); + } + ConvConfig* conv = operatorConf.mutable_conv_conf(); + operatorConf.set_num_filters(NUM_FILTERS); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_channels(CHANNELS); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_img_size(IMAGE_SIZE); + conv->set_img_size_y(IMAGE_SIZE_Y); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + 
conv->set_output_y(outputSize(conv->img_size_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + + if (isDeconv) { + conv->set_filter_channels(NUM_FILTERS / conv->groups()); + config.inputDefs.push_back({INPUT_DATA, + "layer_0", + conv->output_x() * conv->output_y() * CHANNELS, + 0}); + config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS); + } else { + conv->set_filter_channels(conv->channels() / conv->groups()); + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0}); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + NUM_FILTERS); + } + + config.inputDefs.push_back( + {INPUT_DATA, + "layer_1", + FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS, + 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false); +} + +TEST(Operator, conv) { + testConvOperator(/*isDeconv*/ true); + testConvOperator(/*isDeconv*/ false); +} + +TEST(Layer, FeatureMapExpandLayer) { + TestConfig config; + config.layerConfig.set_type("featmap_expand"); + const int CHANNELS = 10; + const int INPUT_SIZE = 100; + config.layerConfig.set_size(INPUT_SIZE * CHANNELS); + config.layerConfig.set_num_filters(CHANNELS); + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, + "layer_0", + /* dim= */ INPUT_SIZE, + /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + for (auto useGpu : {false, true}) { + for (auto asRowVec : {false, true}) { + config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec"); + testLayerGrad(config, + "featmap_expand", + /*batch_size*/ 100, + /* trans= */ false, + useGpu, + /* useWeight */ true); + } + } +} + +TEST(Layer, MultiplexLayer) { + TestConfig config; + const int LAYER_SIZE = 100; + config.layerConfig.set_type("multiplex"); + config.layerConfig.set_size(LAYER_SIZE); + + config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0}); + config.inputDefs.push_back( + {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0}); + config.inputDefs.push_back( + {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu); + } +} + +TEST(Layer, PadLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("pad"); + + int c = 4; + int h = 31; + int w = 36; + size_t size = c * h * w; + config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + PadConfig* pad = input->mutable_pad_conf(); + ImageConfig* image = pad->mutable_image_conf(); + + image->set_channels(c); + image->set_img_size(h); + image->set_img_size_y(w); + pad->add_pad_c(1); + pad->add_pad_c(2); + pad->add_pad_h(2); + pad->add_pad_h(3); + pad->add_pad_w(3); + pad->add_pad_w(5); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "pad", 10, false, useGpu); + } +} + +TEST(Layer, CrossChannelNormLayer) { + TestConfig config; + config.paramInitialMean = 1.; + config.paramInitialStd = 0.; + config.layerConfig.set_type("norm"); + config.layerConfig.set_size(100); + LayerInputConfig* input = config.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type("cross-channel-norm"); + norm->set_channels(10); + norm->set_size(100); + norm->set_scale(0); + 
norm->set_pow(0);
+  norm->set_blocked(0);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
+  }
+}
+
+TEST(Layer, smooth_l1) {
+  TestConfig config;
+  config.layerConfig.set_type("smooth_l1");
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, multibox_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("multibox_loss");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf();
+  multiboxLoss->set_num_classes(21);
+  multiboxLoss->set_input_num(1);
+  multiboxLoss->set_overlap_threshold(0.5);
+  multiboxLoss->set_neg_pos_ratio(3);
+  multiboxLoss->set_neg_overlap(0.5);
+  multiboxLoss->set_background_id(0);
+  multiboxLoss->set_height(3);
+  multiboxLoss->set_width(3);
+
+  size_t gtNum = 1;
+  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
+  labelValue->randomizeUniform();
+  labelValue->add(-0.5);
+  labelValue->sigmoid(*labelValue);
+  real* labelData = labelValue->getData();
+  size_t labelWidth = labelValue->getWidth();
+  for (size_t i = 0; i < gtNum; ++i) {
+    *(labelData + i * labelWidth) = std::rand() % 20 + 1;
+    *(labelData + i * labelWidth + 1) = 0.400259;
+    *(labelData + i * labelWidth + 2) = 0.377857;
+    *(labelData + i * labelWidth + 3) = 0.525712;
+    *(labelData + i * labelWidth + 4) = 0.519368;
+  }
+  vector<int> seqStartPositions(gtNum + 1, 0);
+  for (size_t i = 1; i <= gtNum; ++i) {
+    seqStartPositions[i] = i;
+  }
+
+  // Ensure at least one matched bbox
+  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
+  priorValue->randomizeUniform();
+  priorValue->add(-0.5);
+  priorValue->sigmoid(*priorValue);
+  real* priorData = priorValue->getData();
+  *(priorData) = 0.424811;
+  *(priorData + 1) = 0.397059;
+  *(priorData + 2) = 0.538905;
+  *(priorData + 3) = 0.447091;
+  *(priorData + 4) = 0.425720;
+  *(priorData + 5) = 0.515228;
+  *(priorData + 6) = 0.519452;
+  *(priorData + 7) = 0.591065;
+
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}});
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
+  config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
+  config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "multibox_loss", 1, false, useGpu, false);
+  }
+}
+
+TEST(Layer, TransLayer) {
+  TestConfig config;
+  const int height = 128;
+  const int width = 256;
+  config.layerConfig.set_type("trans");
+  config.layerConfig.set_size(width);
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* dim= */ height * width, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "trans", height, /* trans= */ false, useGpu);
+  }
+}
+
+TEST(Layer, RowConvLayer) {
+  const int context = 3;
+  const int size = 512;
+
+  TestConfig config;
+  config.layerConfig.set_type("row_conv");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sigmoid");
+
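+  // row_conv learns a (context x size) weight matrix, hence the paraSize of
+  // context * size in the input definition below.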
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  RowConvConfig* conv = input->mutable_row_conv_conf();
+  conv->set_context_length(context);
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, CropLayer) {
+  TestConfig config;
+  // config input_0
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ImageConfig* img = input->mutable_image_conf();
+  img->set_channels(4);
+  img->set_img_size(16);
+  config.layerConfig.set_axis(2);
+  config.layerConfig.add_offset(0);
+  config.layerConfig.add_offset(0);
+
+  // config input_1
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0});
+  input = config.layerConfig.add_inputs();
+  img = input->mutable_image_conf();
+  img->set_channels(2);
+  img->set_img_size(8);
+
+  // config crop layer
+  config.layerConfig.set_type("crop");
+  config.layerConfig.set_name("cropLayer");
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "crop", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, roi_pool) {
+  TestConfig config;
+  config.layerConfig.set_type("roi_pool");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
+  roiPoolConf->set_pooled_width(7);
+  roiPoolConf->set_pooled_height(7);
+  roiPoolConf->set_spatial_scale(1. / 16);
+  roiPoolConf->set_width(14);
+  roiPoolConf->set_height(14);
+
+  const size_t roiNum = 10;
+  const size_t roiDim = 10;
+  const size_t batchSize = 5;
+  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
+  roiValue->zeroMem();
+  real* roiData = roiValue->getData();
+  for (size_t i = 0; i < roiNum; ++i) {
+    roiData[i * roiDim + 0] = std::rand() % batchSize;
+    roiData[i * roiDim + 1] = std::rand() % 224;  // xMin
+    roiData[i * roiDim + 2] = std::rand() % 224;  // yMin
+    size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
+    size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
+    roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin);  // xMax
+    roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin);  // yMax
+  }
+
+  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, SwitchOrderLayer) {
+  TestConfig config;
+  // config input_0
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ImageConfig* img = input->mutable_image_conf();
+  img->set_channels(4);
+  img->set_img_size(16);
+  img->set_img_size_y(16);
+
+  ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf();
+  reshape->add_height_axis(0);
+  reshape->add_height_axis(1);
+  reshape->add_height_axis(2);
+  reshape->add_width_axis(3);
+
+  // config switch_order layer
+  config.layerConfig.set_type("switch_order");
+  config.layerConfig.set_name("switchOrderLayer");
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "switch_order", 100, false, useGpu, true);
+  }
+}
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
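+  // Keep only the first n shuffled values; the sort below returns the
+  // sampled indices in ascending order.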
+  sort(begin(num), end(num));
+  return num;
+}
+
+TEST(Layer, SubNestedSequenceLayer) {
+  // The layer size is not crucial for this layer,
+  // so use a small layer size in the unittest.
+  const int layerSize = 4;
+
+  const int maxSeqNum = 50;
+  const int maxSeqLen = 50;
+  const int maxBeamSize = 32;
+
+  srand((size_t)(time(NULL)));
+  int beamSize = 1 + (rand() % maxBeamSize);
+
+  TestConfig config;
+  config.layerConfig.set_type("sub_nested_seq");
+  config.layerConfig.set_name("sub_nested_seq_layer");
+  config.layerConfig.set_size(layerSize);
+
+  int seqNum = 1 + (rand() % maxSeqNum);
+
+  // sequence information for the first input, it is a nested sequence
+  vector<int> seqStartPos(seqNum + 1, 0);
+  vector<int> subSeqStartPos(1, 0);
+
+  // selected indices
+  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
+  selectedIndices->one();
+  selectedIndices->mulScalar(-1.);
+  real* indicesData = selectedIndices->getData();
+
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqNum; ++j) {
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % maxSeqLen)));
+    }
+    vector<real> selSeqs =
+        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
+    memcpy(indicesData + (i * beamSize),
+           selSeqs.data(),
+           selSeqs.size() * sizeof(real));
+    seqStartPos[i + 1] = subSeqStartPos.back();
+  }
+
+  MatrixPtr seqInputPtr =
+      Matrix::create(seqStartPos.back(), layerSize, false, false);
+  seqInputPtr->randomizeUniform();
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                              "nested_seq_input",
+                              seqInputPtr,
+                              seqStartPos,
+                              subSeqStartPos});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sub_nested_seq",
+                  /* batchSize */ seqNum,
+                  /* trans */ false,
+                  /* useGpu*/ useGpu,
+                  /* useWeight */ false);
+  }
+}
+
+TEST(Layer, ClipLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("clip");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ClipConfig* layerConf = input->mutable_clip_conf();
+  double p1 = std::rand() / (double)RAND_MAX;
+  double p2 = std::rand() / (double)RAND_MAX;
+  layerConf->set_min(std::min(p1, p2));
+  layerConf->set_max(std::max(p1, p2));
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, RowL2NormLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("row_l2_norm");
+  config.layerConfig.set_size(size);
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
+  }
+}
+
+void test3DConvLayer(const string& type, bool trans, bool useGpu) {
+  // filter size
+  const int NUM_FILTERS = 6;
+  // const int CHANNELS = 3;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+
+  // input image
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  TestConfig config;
+  config.biasSize = NUM_FILTERS;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(NUM_FILTERS);
+  config.layerConfig.set_partial_sum(1);
+
config.layerConfig.set_shared_biases(true); + + // Setting up conv3D-trans layer + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + + conv->set_channels(CHANNELS); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_filter_size_z(FILTER_SIZE_Z); + conv->set_padding(0); + conv->set_padding_y(0); + conv->set_padding_z(0); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_stride_z(2); + conv->set_img_size(IMAGE_SIZE); + conv->set_img_size_y(IMAGE_SIZE_Y); + conv->set_img_size_z(IMAGE_SIZE_Z); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + conv->set_output_y(outputSize(conv->img_size_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + conv->set_output_z(outputSize(conv->img_size_z(), + conv->filter_size_z(), + conv->padding_z(), + conv->stride_z(), + /* caffeMode */ true)); + + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + conv->output_z() * NUM_FILTERS); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + config.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z, + conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z * + NUM_FILTERS}); + + testLayerGrad(config, "conv3D", 10, trans, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02); +} + +TEST(Layer, test3DConvLayer) { + test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false); +#ifdef PADDLE_WITH_CUDA + test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true); +#endif +} + +void test3DDeConvLayer(const string& type, bool trans, bool useGpu) { + // filter size + const int NUM_FILTERS = 6; + // const int CHANNELS = 3; + const int FILTER_SIZE = 3; + const int FILTER_SIZE_Y = 3; + const int FILTER_SIZE_Z = 3; + + // input image + const int CHANNELS = 3; + const int IMAGE_SIZE = 4; + const int IMAGE_SIZE_Y = 6; + const int IMAGE_SIZE_Z = 6; + + // Setting up conv-trans layer + TestConfig config; + config.biasSize = NUM_FILTERS; + config.layerConfig.set_type("deconv3d"); + config.layerConfig.set_num_filters(NUM_FILTERS); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + + conv->set_channels(CHANNELS); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_filter_size_z(FILTER_SIZE_Z); + conv->set_padding(0); + conv->set_padding_y(0); + conv->set_padding_z(0); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_stride_z(2); + conv->set_output_x(IMAGE_SIZE); + conv->set_output_y(IMAGE_SIZE_Y); + conv->set_output_z(IMAGE_SIZE_Z); + + conv->set_img_size(imageSize(conv->output_x(), + conv->filter_size(), + conv->padding(), + conv->stride(), + true)); + conv->set_img_size_y(imageSize(conv->output_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + true)); + conv->set_img_size_z(imageSize(conv->output_z(), + conv->filter_size_z(), + conv->padding_z(), + conv->stride_z(), + true)); + config.layerConfig.set_size(conv->img_size() * conv->img_size_y() * + conv->img_size_z() * NUM_FILTERS); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); 
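+  // Note: img_size/img_size_y/img_size_z above are derived with imageSize(),
+  // the inverse of outputSize(); in caffeMode this is
+  // imageSize = (outputSize - 1) * stride + filterSize - 2 * padding.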
+ config.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z, + conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z * + NUM_FILTERS}); + + testLayerGrad(config, "deconv3D", 10, trans, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02); +} + +TEST(Layer, test3DDeConvLayer) { + test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false); +#ifdef PADDLE_WITH_CUDA + test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true); +#endif +} + +TEST(Layer, ScaleShiftLayer) { + // FIXME: Disable ScaleShiftLayer because it is not stable. + // https://github.com/PaddlePaddle/Paddle/issues/7781 + return; + // const size_t batchSize = 16; + // const size_t size = 32; + // TestConfig config; + // config.layerConfig.set_type("scale_shift"); + // config.layerConfig.set_size(size); + // config.biasSize = 1; + // config.inputDefs.push_back( + // {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1}); + // config.layerConfig.add_inputs(); + // for (auto useGpu : {false, true}) { + // testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false); + // } +} + +TEST(Layer, ScaleSubRegionLayer) { + const size_t batchSize = 64; + const size_t size = 4096; + TestConfig config; + config.layerConfig.set_type("scale_sub_region"); + config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); + MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false); + auto* data = indicesV->getData(); + for (size_t i = 0; i < batchSize; ++i) { + data[i * 2] = 2; + data[i * 2 + 1] = 4; + data[i * 2 + 2] = 16; + data[i * 2 + 3] = 32; + data[i * 2 + 4] = 16; + data[i * 2 + 5] = 32; + } + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ScaleSubRegionConfig* scaleSubRegionConf = + input->mutable_scale_sub_region_conf(); + ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf(); + imgConf->set_img_size(32); + imgConf->set_img_size_y(32); + imgConf->set_channels(4); + scaleSubRegionConf->set_value(2.0); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false); + } +} + +TEST(Layer, L2DistanceLayer) { + TestConfig config; + config.layerConfig.set_type("l2_distance"); + config.layerConfig.set_size(1); + config.biasSize = 0; + + const size_t input_dim = 27; + const size_t batch_size = 11; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "l2_distance", batch_size, false, useGpu); + } +} + +void testFactorizationMachineLayer(InputType type, bool useGpu) { + const int FACTOR_SIZE = 10; + TestConfig config; + config.layerConfig.set_type("factorization_machine"); + config.layerConfig.set_factor_size(FACTOR_SIZE); + config.layerConfig.set_size(1); + config.biasSize = 0; + config.inputDefs.push_back({type, "layer_0", 128, 1280}); + config.layerConfig.add_inputs(); + testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); +} + +TEST(Layer, FactorizationMachineLayer) { + for (auto useGpu : {false, true}) { + testFactorizationMachineLayer(INPUT_DATA, useGpu); + } + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, 
false);
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp b/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1c9549255180294bf5d12c7085d102fd1851b2d5
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <vector>
+#include "paddle/legacy/gserver/layers/LinearChainCRF.h"
+#include "paddle/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static inline bool getNextSequence(vector<int>& seq, int numClasses) {
+  for (auto& v : seq) {
+    if (++v < numClasses) {
+      return true;
+    }
+    v = 0;
+  }
+  return false;
+}
+
+TEST(LinearChainCRF, decoding) {
+  const int numClasses = 4;
+  CpuVector para(numClasses * (numClasses + 2));
+  real* a = para.getData();
+  real* b = para.getData() + numClasses;
+  real* w = para.getData() + 2 * numClasses;
+  LinearChainCRF crf(4, para.getData());
+  for (int length : {1, 2, 3, 10}) {
+    for (int tries = 0; tries < 10; ++tries) {
+      CpuMatrix x(length, numClasses);
+      x.randomizeUniform();
+      para.randnorm(0, 2);
+      vector<int> decodingResult(length);
+      vector<int> bestResult(length);
+      vector<int> testResult(length, 0);
+      crf.decode(x.getData(), &decodingResult[0], length);
+      real bestScore = -std::numeric_limits<real>::max();
+      do {
+        real score = a[testResult.front()] + b[testResult.back()];
+        score += x.getElement(0, testResult.front());
+        for (int k = 1; k < length; ++k) {
+          score += x.getElement(k, testResult[k]) +
+                   w[numClasses * testResult[k - 1] + testResult[k]];
+        }
+        if (score > bestScore) {
+          bestScore = score;
+          bestResult = testResult;
+        }
+      } while (getNextSequence(testResult, numClasses));
+      for (int k = 0; k < length; ++k) {
+        EXPECT_EQ(decodingResult[k], bestResult[k]);
+      }
+    }
+  }
+}
diff --git a/paddle/legacy/gserver/tests/test_MKLDNN.cpp b/paddle/legacy/gserver/tests/test_MKLDNN.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a20ccfb772dcd119ef14f640dcde7259e4b4de79
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_MKLDNN.cpp
@@ -0,0 +1,448 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "MKLDNNTester.h"
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/activations/MKLDNNActivation.h"
+#include "paddle/legacy/math/MathUtils.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_bool(use_mkldnn);
+
+#define RUN_MKLDNN_TEST(DNN_CONFIG, REF_CONFIG, DESC)          \
+  MKLDNNTester tester;                                         \
+  for (auto bs : {DESC.bs, 1}) {                               \
+    tester.run(DNN_CONFIG, REF_CONFIG, bs, DESC.ih, DESC.iw);  \
+  }
+
+#define RUN_MKLDNN_TEST_LAYER(DNN_CONFIG, REF_TYPE, DESC)  \
+  TestConfig ref = DNN_CONFIG;                             \
+  ref.layerConfig.set_type(REF_TYPE);                      \
+  RUN_MKLDNN_TEST(DNN_CONFIG, ref, DESC)
+
+struct testFcDesc {
+  int bs;
+  int ic;
+  int ih, iw;  // oh == ow == 1
+  int oc;
+};
+
+static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_fc");
+  cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.set_size(pm.oc);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)});
+  cfg.layerConfig.add_inputs();
+}
+
+void testFcLayer(const testFcDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNFcConfig(dnnConfig, pm);
+  for (auto biasSize : {pm.oc, 0}) {
+    dnnConfig.biasSize = biasSize;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm)
+  }
+}
+
+TEST(MKLDNNLayer, FcLayer) {
+  /* bs, ic, ih, iw, oc */
+  testFcLayer({2, 2, 1, 1, 3});
+  testFcLayer({3, 7, 1, 1, 19});
+  testFcLayer({8, 16, 13, 13, 32});
+  testFcLayer({4, 12, 13, 13, 18});
+  testFcLayer({2, 64, 16, 16, 32});
+  testFcLayer({15, 3, 16, 16, 6});
+}
+
+struct testConvDesc {
+  int bs, gp;
+  int ic, ih, iw;
+  int oc, oh, ow;
+  int fh, fw;
+  int ph, pw;
+  int sh, sw;
+  int dh, dw;
+};
+
+static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_conv");
+  cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.set_num_filters(pm.oc);
+  cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
+  cfg.layerConfig.set_shared_biases(true);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */ size_t(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp)});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_groups(pm.gp);
+  conv->set_img_size(pm.iw);
+  conv->set_img_size_y(pm.ih);
+  conv->set_output_x(pm.ow);
+  conv->set_output_y(pm.oh);
+  conv->set_filter_size(pm.fw);
+  conv->set_filter_size_y(pm.fh);
+  conv->set_channels(pm.ic);
+  conv->set_padding(pm.pw);
+  conv->set_padding_y(pm.ph);
+  conv->set_stride(pm.sw);
+  conv->set_stride_y(pm.sh);
+  conv->set_dilation(pm.dw);
+  conv->set_dilation_y(pm.dh);
+  conv->set_caffe_mode(true);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels())
+      << "the number of channels must be divisible by the number of groups";
+
+  int fh = (pm.fh - 1) * pm.dh + 1;
+  int fw = (pm.fw - 1) * pm.dw + 1;
+  int ow = outputSize(pm.iw, fw, pm.pw, pm.sw, true);
+  int oh = outputSize(pm.ih, fh, pm.ph, pm.sh, true);
+  CHECK_EQ(ow, pm.ow) << "output size check failed";
+  CHECK_EQ(oh, pm.oh) << "output size check failed";
+}
+
+void testConvLayer(const testConvDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNConvConfig(dnnConfig, pm);
+  for (auto biasSize : {pm.oc, 0}) {
+    dnnConfig.biasSize = biasSize;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "exconv", pm)
+  }
+} + +TEST(MKLDNNLayer, ConvLayer) { + /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */ + testConvLayer({2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1}); + testConvLayer({2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1}); + testConvLayer({3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1}); + // with groups + testConvLayer({2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1}); +} + +struct testPoolDesc { + int bs, ic; // input channel and output channel are the same + int ih, iw; + int oh, ow; + int fh, fw; + int ph, pw; + int sh, sw; +}; + +static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) { + cfg.layerConfig.set_type("mkldnn_pool"); + cfg.layerConfig.set_active_type("relu"); + cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow); + cfg.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), + 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + PoolConfig* pool = input->mutable_pool_conf(); + pool->set_pool_type("avg-projection"); + pool->set_channels(pm.ic); + pool->set_img_size(pm.iw); + pool->set_img_size_y(pm.ih); + pool->set_output_x(pm.ow); + pool->set_output_y(pm.oh); + pool->set_size_x(pm.fw); + pool->set_size_y(pm.fh); + pool->set_padding(pm.pw); + pool->set_padding_y(pm.ph); + pool->set_stride(pm.sw); + pool->set_stride_y(pm.sh); + + int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false); + int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false); + CHECK_EQ(ow, pm.ow) << "output size check failed"; + CHECK_EQ(oh, pm.oh) << "output size check failed"; +} + +void testPoolLayer(const testPoolDesc& pm) { + TestConfig dnnConfig; + getMKLDNNPoolConfig(dnnConfig, pm); + LayerInputConfig* input = dnnConfig.layerConfig.mutable_inputs(0); + PoolConfig* pool = input->mutable_pool_conf(); + for (auto type : {"max-projection", "avg-projection"}) { + pool->set_pool_type(type); + RUN_MKLDNN_TEST_LAYER(dnnConfig, "pool", pm) + } +} + +TEST(MKLDNNLayer, PoolLayer) { + /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw */ + testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2}); + testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2}); + testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2}); + testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2}); + testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2}); + testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1}); + testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1}); + testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2}); +} + +struct testBatchNormDesc { + int bs; + int ic; + int ih, iw; +}; + +static void getMKLDNNBatchNormConfig(TestConfig& cfg, + const testBatchNormDesc& pm) { + cfg.layerConfig.set_size(pm.ic * pm.ih * pm.iw); + cfg.layerConfig.set_type("mkldnn_batch_norm"); + cfg.biasSize = pm.ic; + cfg.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), + /* size of weight= */ size_t(pm.ic)}); + cfg.inputDefs.push_back( + {INPUT_DATA, "layer_1_moving_mean", 1, size_t(pm.ic)}); + cfg.inputDefs.back().isStatic = true; + cfg.inputDefs.push_back({INPUT_DATA, 
"layer_2_moving_var", 1, size_t(pm.ic)}); + cfg.inputDefs.back().isStatic = true; + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + cfg.layerConfig.set_active_type("relu"); + cfg.layerConfig.add_inputs(); + cfg.layerConfig.add_inputs(); + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(pm.ic); + img_conf->set_img_size_y(pm.ih); + img_conf->set_img_size(pm.iw); +} + +void testBatchNormLayer(const testBatchNormDesc& pm) { + TestConfig dnnConfig; + getMKLDNNBatchNormConfig(dnnConfig, pm); + TestConfig refConfig = dnnConfig; + refConfig.layerConfig.set_type("batch_norm"); + // for PASS_TRAIN, use_global_stats always should be false, and batchsize != 1 + VLOG(MKLDNN_TESTS) << "check train phase"; + dnnConfig.layerConfig.set_use_global_stats(false); + refConfig.layerConfig.set_use_global_stats(false); + MKLDNNTester tester; + tester.run(dnnConfig, refConfig, pm.bs, pm.ih, pm.iw, PASS_TRAIN); + // for PASS_TEST, check use_global_stats true and false, and batchsize 1 + VLOG(MKLDNN_TESTS) << "check test phase"; + for (auto useGS : {false, true}) { + dnnConfig.layerConfig.set_use_global_stats(useGS); + refConfig.layerConfig.set_use_global_stats(useGS); + MKLDNNTester tester; + for (auto bs : {pm.bs, 1}) { + tester.run(dnnConfig, refConfig, bs, pm.ih, pm.iw, PASS_TEST); + } + } +} + +TEST(MKLDNNLayer, BatchNormLayer) { + testBatchNormLayer({4, 10, 6, 6}); + testBatchNormLayer({16, 32, 16, 16}); + testBatchNormLayer({4, 16, 8, 10}); +} + +struct testLRNDesc { + int bs, ic, ih, iw; + float scale, pow; + int localSize; +}; + +void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) { + cfg.layerConfig.set_type("mkldnn_lrn"); + cfg.layerConfig.set_active_type("relu"); + size_t layerSize = pm.ic * pm.ih * pm.iw; + cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_channels(pm.ic); + norm->set_size(pm.localSize); + norm->set_scale(pm.scale); + norm->set_pow(pm.pow); + norm->set_blocked(0); + norm->set_img_size(pm.iw); + norm->set_img_size_y(pm.ih); + norm->set_output_x(norm->img_size()); + norm->set_output_y(norm->img_size_y()); + cfg.layerConfig.set_size(layerSize); + cfg.biasSize = 0; +} + +void testLRNLayer(const testLRNDesc& pm) { + TestConfig dnnConfig; + getMKLDNNLRNConfig(dnnConfig, pm); + // mkldnn_lrn <==> norm with cmrnorm-projection type + TestConfig refConfig = dnnConfig; + refConfig.layerConfig.set_type("norm"); + LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type("cmrnorm-projection"); + norm->set_scale(norm->scale() / norm->size()); + RUN_MKLDNN_TEST(dnnConfig, refConfig, pm) +} + +TEST(MKLDNNLayer, LRNLayer) { + testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5}); + testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5}); + testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5}); +} + +struct testImageDesc { + int bs, ic, ih, iw; +}; + +static void getAddtoConfig(TestConfig& cfg, + const testImageDesc& pm, + const size_t nInputs = 1) { + cfg.biasSize = 0; + cfg.layerConfig.set_type("addto"); + size_t layerSize = pm.ic * pm.ih * pm.iw; + cfg.layerConfig.set_size(layerSize); + cfg.layerConfig.set_active_type("relu"); + for (size_t i = 0; i < nInputs; ++i) { + std::stringstream ss; + ss << "layer_" << i; + cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + ImageConfig* img_conf = 
input->mutable_image_conf();
+    img_conf->set_channels(pm.ic);
+    img_conf->set_img_size_y(pm.ih);
+    img_conf->set_img_size(pm.iw);
+  }
+}
+
+void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
+  CHECK_GE(nInputs, 1UL);
+  TestConfig dnnConfig;
+  getAddtoConfig(dnnConfig, pm, nInputs);
+  dnnConfig.layerConfig.set_type("mkldnn_addto");
+  for (auto withBias : {false, true}) {
+    dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
+  }
+}
+
+TEST(MKLDNNLayer, AddtoLayer) {
+  testAddtoLayer({16, 5, 14, 14}, 1);
+  testAddtoLayer({8, 10, 8, 8}, 2);
+  testAddtoLayer({4, 12, 1, 1}, 3);
+}
+
+static void getMKLDNNConcatConfig(TestConfig& cfg,
+                                  const std::vector<testImageDesc>& inputs) {
+  CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
+  int oc = inputs[0].ic;
+  for (size_t i = 1; i < inputs.size(); ++i) {
+    CHECK_EQ(inputs[i].bs, inputs[0].bs);
+    CHECK_EQ(inputs[i].ih, inputs[0].ih);
+    CHECK_EQ(inputs[i].iw, inputs[0].iw);
+    oc += inputs[i].ic;
+  }
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("mkldnn_concat");
+  cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back(
+        {INPUT_DATA,
+         ss.str(),
+         (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
+         0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(inputs[i].ic);
+    img_conf->set_img_size_y(inputs[i].ih);
+    img_conf->set_img_size(inputs[i].iw);
+  }
+}
+
+void testConcatLayer(const std::vector<testImageDesc>& inputs) {
+  TestConfig dnnConfig;
+  getMKLDNNConcatConfig(dnnConfig, inputs);
+  RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
+}
+
+TEST(MKLDNNLayer, ConcatLayer) {
+  testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
+  testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
+}
+
+void testActivation(std::string actType, const testImageDesc& pm) {
+  // TODO(TJ): remove me when paddle support elu activation
+  if (actType == "mkldnn_elu") {
+    return;
+  }
+  const std::string compareTypes[] = {actType, actType.erase(0, 7)};
+  TestConfig cfg;
+  getAddtoConfig(cfg, pm);
+  TestConfig ref = cfg;
+  cfg.layerConfig.set_active_type(compareTypes[0]);
+  ref.layerConfig.set_active_type(compareTypes[1]);
+  RUN_MKLDNN_TEST(cfg, ref, pm)
+}
+
+TEST(MKLDNNActivation, Activations) {
+  auto types = MKLDNNActivation::getAllRegisteredTypes();
+  for (auto type : types) {
+    /* bs, c, h, w*/
+    testActivation(type, {16, 64, 32, 32});
+    testActivation(type, {2, 8, 1, 1});
+  }
+}
+
+DECLARE_string(config_args);
+TEST(MKLDNNNet, net) {
+  std::vector<std::string> cases = {"simple", "branch"};
+  for (auto name : cases) {
+    std::string config = "./legacy/gserver/tests/mkldnn_" + name + "_net.conf";
+    for (auto channels : {2, 32}) {
+      std::ostringstream oss;
+      oss << "channels=" << channels;
+      FLAGS_config_args = oss.str();
+      MKLDNNTester::runNetTest(config);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  FLAGS_use_gpu = false;
+  FLAGS_use_mkldnn = true;
+  initMain(argc, argv);
+  initPython(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2bc261b4a87ce7f1f4ce1c936ee4151d75e17f3f
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;
+
+void setPoolConfig(TestConfig* config,
+                   PoolConfig* pool,
+                   const string& poolType) {
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool");
+  (*config).layerConfig.set_num_filters(1);
+
+  int kw = 3, kh = 3;
+  int pw = 0, ph = 0;
+  int sw = 2, sh = 2;
+  pool->set_pool_type(poolType);
+  pool->set_channels(1);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_start(0);
+  pool->set_padding(pw);
+  pool->set_padding_y(ph);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+}
+
+void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
+                                       const string& poolType,
+                                       bool use_gpu,
+                                       MatrixPtr& maskMat) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  pool->set_img_size(5);
+  pool->set_img_size_y(5);
+  setPoolConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  config.layerConfig.set_name("MaxPoolWithMask");
+
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+
+  initDataLayer(config,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "MaxPoolWithMask",
+                1,
+                false,
+                use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
+
+  FLAGS_use_gpu = use_gpu;
+  std::vector<ParameterPtr> parameters;
+  LayerPtr maxPoolingWithMaskOutputLayer;
+  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
+  maxPoolingWithMaskOutputLayer->forward(PASS_GC);
+
+  checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
+                   maskMat);
+}
+
+TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
+  bool useGpu = false;
+  MatrixPtr inputMat;
+  MatrixPtr maskMat;
+  real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
+                      0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
+                      0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
+  real maskData[] = {12, 4, 22, 24};
+
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->setData(inputData);
+  maskMat->setData(maskData);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#ifdef PADDLE_WITH_CUDA
+  useGpu = true;
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->copyFrom(inputData, 25);
+  maskMat->copyFrom(maskData, 4);
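+  // The expected mask stores, for each pooling window, the flat offset of
+  // its maximum inside the 5x5 input: 0.7 -> 12, 1.1 -> 4, 3.0 -> 22,
+  // 9.0 -> 24. The GPU path below must reproduce the same mask.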
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#endif
+}
diff --git a/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp b/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ca1a588d83acc76bf59a8edfaaf51828dc4a569a
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+
+#include <random>
+#include <vector>
+
+#undef PADDLE_DISABLE_TIMER
+#include "paddle/utils/Stat.h"
+
+#include "paddle/legacy/gserver/layers/MultinomialSampler.h"
+#include "paddle/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+class MultinomialSamplerTester : public MultinomialSampler {
+ public:
+  MultinomialSamplerTester(real* prob, int size)
+      : MultinomialSampler(prob, size) {}
+
+  template <typename Rand1>
+  int testGen(Rand1 rand1) {
+    return gen1(rand1);
+  }
+};
+
+TEST(MultinomialSampler, gen) {
+  int numGrids = 1024 * 1024;
+  int size = 1024 * 4;
+  default_random_engine reng;
+
+  for (size_t iter = 0; iter < 256; ++iter) {
+    uniform_int_distribution<int> rand(1, numGrids / size * 1.8);
+    vector<real> prob;
+    int sum = 0;
+    for (int i = 0; i < size; ++i) {
+      prob.push_back(rand(reng));
+      sum += prob.back();
+    }
+
+    CHECK_LE(sum, numGrids);
+    prob.back() += numGrids - sum;
+
+    vector<int> counts(size);
+    MultinomialSamplerTester sampler(&prob[0], size);
+    counts.assign(size, 0);
+    {
+      double s = (double)size / (double)numGrids;
+      REGISTER_TIMER("MultinomialSampler");
+      for (double i = 0; i < numGrids; ++i) {
+        int ret = sampler.testGen([i, s]() { return s * i; });
+        if (ret < 0 || ret >= size) {
+          EXPECT_GE(ret, 0);
+          EXPECT_LT(ret, size);
+          break;
+        }
+        ++counts[ret];
+      }
+    }
+    for (int i = 0; i < size; ++i) {
+      if (prob[i] != counts[i]) {
+        EXPECT_EQ(prob[i], counts[i]);
+        LOG(INFO) << iter;
+        break;
+      }
+    }
+  }
+}
+
+void benchmarkRandom() {
+  int n = 1024 * 1024;
+
+  int sum;
+  double sum1;
+
+  sum = 0;
+  unsigned int seed = 1;
+  {
+    REGISTER_TIMER("crand");
+    for (int i = 0; i < n; ++i) {
+      sum += rand_r(&seed) % 1000;
+    }
+  }
+  LOG(INFO) << "sum=" << sum;
+
+  default_random_engine reng;
+  uniform_int_distribution<int> rand(1, 1000);
+  sum = 0;
+  {
+    REGISTER_TIMER("stdrand");
+    for (int i = 0; i < n; ++i) {
+      sum += rand(reng);
+    }
+  }
+  LOG(INFO) << "sum=" << sum;
+
+  sum = 0;
+  {
+    REGISTER_TIMER("default_random_engine");
+    for (int i = 0; i < n; ++i) {
+      sum += reng();
+    }
+  }
+  LOG(INFO) << "sum=" << sum;
+
+  uniform_real_distribution<double> rand1(0, 1);
+  sum1 = 0;
+  {
+    REGISTER_TIMER("stdrand1");
+    for (int i = 0; i < n; ++i) {
+      sum1 += rand1(reng);
+    }
+  }
+  LOG(INFO) << "sum1=" << sum1;
+
+  sum1 = 0;
+  {
+    real a = 1.0f / (real)RAND_MAX;
+    REGISTER_TIMER("crand1");
+    for (int i = 0; i < n; ++i) {
+      sum1 += a * rand_r(&seed);
+    }
+  }
+  LOG(INFO) << "sum1=" << sum1;
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  benchmarkRandom();
+  int ret = RUN_ALL_TESTS();
+  globalStat.printSegTimerStatus();
+  return ret;
+}
diff --git a/paddle/legacy/gserver/tests/test_NetworkCompare.cpp b/paddle/legacy/gserver/tests/test_NetworkCompare.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5a6b2245830832c1ca60ec657231c1bc2900f158
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_NetworkCompare.cpp
@@ -0,0 +1,294 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#undef PADDLE_DISABLE_TIMER
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "paddle/testing/TestUtil.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/utils/Stat.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DEFINE_bool(use_label, true, "input label or sequence label");
+DEFINE_bool(static_para, false, "static parameter");
+
+struct DataIn {
+  std::vector<Argument> inArgs;
+  std::vector<MatrixPtr> outGrads;
+  std::vector<VectorPtr> paraValues;
+};
+
+struct DataOut {
+  std::vector<MatrixPtr> outValues;
+  std::vector<VectorPtr> paraGrads;
+};
+
+void initArgument(DataIn& data,
+                  const std::string& configPath,
+                  bool useGpu = FLAGS_use_gpu) {
+  TrainerConfigHelper config(configPath);
+  size_t batchSize = config.getOptConfig().batch_size();
+
+  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    Argument arg;
+    arg.value = Matrix::create(batchSize, layerSize, false, useGpu);
+    arg.grad = Matrix::create(batchSize, layerSize, false, useGpu);
+    arg.value->randomizeUniform();
+    arg.value->add(-0.5);
+    arg.value->sigmoid(*arg.value);
+    arg.grad->zeroMem();
+    if (FLAGS_use_label) {
+      arg.ids = VectorT<int>::create(batchSize, useGpu);
+      arg.ids->rand(layerSize);
+    }
+    generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
+    data.inArgs.push_back(arg);
+  }
+
+  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    MatrixPtr grad = Matrix::create(batchSize, layerSize, false, useGpu);
+    grad->randomizeUniform();
+    data.outGrads.push_back(grad);
+  }
+
+  for (const auto& para_config : config.getModelConfig().parameters()) {
+    VectorPtr value = Vector::create(para_config.size(), useGpu);
+    value->randnorm(0, 2);
+    data.paraValues.push_back(value);
+  }
+}
+
+void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) {
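+  // Reset the random seeds up front so the two calcGradient() runs over
+  // network A and network B see identical randomness.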
+  *ThreadLocalRand::getSeed() = 0;
+  srand(0);
+
+  Trainer trainer;
+  auto config = std::make_shared<TrainerConfigHelper>(configPath);
+  trainer.init(config, false);
+
+  std::vector<ParameterPtr> parameters;
+  vector<Argument> outArgs;
+
+  auto gradientMachine = trainer.getGradientMachine();
+  parameters = gradientMachine->getParameters();
+  if (FLAGS_static_para) {
+    for (size_t i = 0; i < parameters.size(); i++) {
+      parameters[i]->getBuf(PARAMETER_VALUE)->one();
+    }
+  } else {
+    for (size_t i = 0; i < in.paraValues.size(); i++) {
+      parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
+    }
+  }
+  gradientMachine->start();
+  gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
+  for (size_t i = 0; i < in.outGrads.size(); i++) {
+    // If all the layers in the config have no parameters and do not set
+    // NeedGradient(), outArgs[i].grad will be nullptr.
+    outArgs[i].grad->copyFrom(*in.outGrads[i]);
+  }
+  gradientMachine->backward();
+  for (size_t i = 0; i < in.outGrads.size(); i++) {
+    MatrixPtr value = Matrix::create(outArgs[i].value->getHeight(),
+                                     outArgs[i].value->getWidth(),
+                                     false,
+                                     false);
+    value->copyFrom(*outArgs[i].value);
+    out.outValues.push_back(value);
+  }
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    VectorPtr grad = Vector::create(
+        parameters[i]->getBuf(PARAMETER_GRADIENT)->getSize(), false);
+    grad->copyFrom(*parameters[i]->getBuf(PARAMETER_GRADIENT));
+    out.paraGrads.push_back(grad);
+  }
+
+  for (int i = 0; i < 20; i++) {
+    REGISTER_TIMER("forward");
+    gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
+  }
+  for (int i = 0; i < 20; i++) {
+    REGISTER_TIMER("backward");
+    gradientMachine->backward();
+  }
+
+  gradientMachine->finish();
+}
+
+void checkBuffer(real* A,
+                 const char* desA,
+                 real* B,
+                 const char* desB,
+                 size_t len,
+                 size_t width = 1) {
+  int nNum = 0;
+  for (size_t i = 0; i < len; ++i) {
+    real diff = fabs(A[i] - B[i]);
+    if (diff > 0.0f &&
+        diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_checkgrad_eps) {
+      nNum++;
+      LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
+                << " " << desB << " : " << B[i];
+    }
+  }
+  EXPECT_EQ(0, nNum);
+}
+
+void compareGradient(DataOut& outA, DataOut& outB) {
+  LOG(INFO) << "------------------------------"
+            << " Check Network Output "
+            << "------------------------------";
+  for (size_t i = 0; i < outA.outValues.size(); ++i) {
+    LOG(INFO) << "OUTPUT VALUE: " << i;
+    checkBuffer(outA.outValues[i]->getData(),
+                "network A output",
+                outB.outValues[i]->getData(),
+                "network B output",
+                outA.outValues[i]->getElementCnt(),
+                outA.outValues[i]->getWidth());
+  }
+
+  if (!FLAGS_static_para) {
+    LOG(INFO) << "------------------------------"
+              << " Check Parameters "
+              << "------------------------------";
+    for (size_t i = 0; i < outA.paraGrads.size(); ++i) {
+      LOG(INFO) << "PARAMETER GRADIENT: " << i;
+      checkBuffer(outA.paraGrads[i]->getData(),
+                  "Network A",
+                  outB.paraGrads[i]->getData(),
+                  "Network B",
+                  outA.paraGrads[i]->getSize());
+    }
+  }
+}
+
+void compareNetwork(const std::string& config_file_a,
+                    const std::string& config_file_b) {
+  DataIn in;
+  initArgument(in, config_file_a);
+
+  DataOut dataA;
+  calcGradient(in, dataA, config_file_a);
+  LOG(INFO) << "forwardBackward of Network A is finished";
+  globalStat.printSegTimerStatus();
+  globalStat.reset();
+  LOG(INFO) << "\n\n";
+
+  DataOut dataB;
+  calcGradient(in, dataB, config_file_b);
+  LOG(INFO) << "forwardBackward of Network B is finished";
+  globalStat.printSegTimerStatus();
+  globalStat.reset();
+  LOG(INFO) << "\n\n";
+
+  compareGradient(dataA, dataB);
+}
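+
+// Each Compare.* case below feeds identical inputs and parameters through
+// two supposedly-equivalent configs and requires outputs and parameter
+// gradients to agree within FLAGS_checkgrad_eps (relative error, see
+// checkBuffer above). The flag-driven Compare.network case near the end of
+// this file can be pointed at arbitrary configs, e.g. (sketch, using the
+// gflags flags defined below):
+//   ./test_NetworkCompare --config_file_a=net_a.conf --config_file_b=net_b.conf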
+ +TEST(Compare, concat_dotmul) { + std::string config_file_a = "./legacy/gserver/tests/concat_dotmul_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/concat_dotmul_b.conf"; + compareNetwork(config_file_a, config_file_b); +} + +TEST(Compare, concat_fullmatrix) { + std::string config_file_a = "./legacy/gserver/tests/concat_fullmatrix_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/concat_fullmatrix_b.conf"; + compareNetwork(config_file_a, config_file_b); +} + +TEST(Compare, concat_table) { + std::string config_file_a = "./legacy/gserver/tests/concat_table_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/concat_table_b.conf"; + compareNetwork(config_file_a, config_file_b); +} + +TEST(Compare, concat_slice) { + std::string config_file_a = "./legacy/gserver/tests/concat_slice_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/concat_slice_b.conf"; + compareNetwork(config_file_a, config_file_b); +} + +#ifdef PADDLE_WITH_CUDA +TEST(Compare, img_pool) { + std::string config_file_a = "./legacy/gserver/tests/img_pool_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/img_pool_b.conf"; + bool useGpu = FLAGS_use_gpu; + FLAGS_use_gpu = true; + compareNetwork(config_file_a, config_file_b); + FLAGS_use_gpu = useGpu; +} + +TEST(Compare, img_conv) { + std::string config_file_a = "./legacy/gserver/tests/img_conv_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/img_conv_b.conf"; + bool useGpu = FLAGS_use_gpu; + FLAGS_use_gpu = true; + compareNetwork(config_file_a, config_file_b); + FLAGS_use_gpu = useGpu; +} + +// Test cudnn_conv and exconv give the same result +TEST(Compare, img_conv2) { + std::string config_file_a = "./legacy/gserver/tests/img_conv_cudnn.py"; + std::string config_file_b = "./legacy/gserver/tests/img_conv_exconv.py"; + bool useGpu = FLAGS_use_gpu; + double eps = FLAGS_checkgrad_eps; + FLAGS_use_gpu = true; + // Sometimes, this unit test will fail with 1e-2 + FLAGS_checkgrad_eps = 4e-2; + compareNetwork(config_file_a, config_file_b); + FLAGS_use_gpu = useGpu; + FLAGS_checkgrad_eps = eps; +} +#endif + +DEFINE_string(config_file_a, "", "config of one network to compare"); +DEFINE_string(config_file_b, "", "config of another network to compare"); +TEST(Compare, network) { + if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") { + compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + paddle::initMain(argc, argv); + initPython(argc, argv); + int ret = RUN_ALL_TESTS(); + return ret; +} diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/legacy/gserver/tests/test_PriorBox.cpp similarity index 100% rename from paddle/gserver/tests/test_PriorBox.cpp rename to paddle/legacy/gserver/tests/test_PriorBox.cpp diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider.cpp b/paddle/legacy/gserver/tests/test_PyDataProvider.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9cde4ecca52957a6de30bb37a497d4af162d804c --- /dev/null +++ b/paddle/legacy/gserver/tests/test_PyDataProvider.cpp @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "paddle/legacy/gserver/dataproviders/PyDataProvider.h"
+#include "paddle/utils/Util.h"
+
+#include "paddle/testing/TestUtil.h"
+
+using namespace std;     // NOLINT
+using namespace paddle;  // NOLINT
+
+void simpleValueCheck(const vector<Argument>& argumentList, bool useGpu);
+void simpleSequenceCheck(const vector<Argument>& argumentList, int sample_num);
+
+TEST(PyDataProvider, py_fill_slots) {
+  DataConfig config;
+  config.set_type("py");
+  config.set_async_load_data(false);
+  config.set_load_data_module(std::string("pyDataProvider"));
+  config.set_load_data_object(std::string("SimpleDataProvider"));
+  config.clear_files();
+  std::string dataFile =
+      "legacy/gserver/tests/pyDataProvider/pyDataProviderList";
+  config.set_files(dataFile);
+#ifndef PADDLE_WITH_CUDA
+  bool useGpu = false;
+#else
+  bool useGpu = true;
+#endif
+  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
+  DataBatch dataBatch;
+  dataProvider->getNextBatchInternal(2, &dataBatch);
+  const std::vector<Argument>& argumentList = dataBatch.getStreams();
+  // Check size
+  EXPECT_EQ(argumentList.size(), 3UL);
+  EXPECT_EQ(argumentList[0].value->getWidth(), 3UL);
+  EXPECT_EQ(argumentList[0].value->getHeight(), 2UL);
+  EXPECT_EQ(argumentList[0].value->getElementCnt(), 6UL);
+  EXPECT_EQ(argumentList[1].value->getWidth(), 7UL);
+  EXPECT_EQ(argumentList[1].value->getHeight(), 2UL);
+  EXPECT_EQ(argumentList[1].value->getElementCnt(), 4UL);
+  EXPECT_EQ(argumentList[2].ids->getSize(), 2UL);
+  // Check value
+  simpleValueCheck(argumentList, useGpu);
+  // Check sequenceStartPositions
+  simpleSequenceCheck(argumentList, 2);
+}
+
+TEST(PyDataProvider, py_fill_nest_slots) {
+  DataConfig config;
+  config.set_type("py");
+  config.set_async_load_data(false);
+  config.set_load_data_module(std::string("pyDataProvider"));
+  config.set_load_data_object(std::string("SimpleNestDataProvider"));
+  config.clear_files();
+  std::string dataFile =
+      "legacy/gserver/tests/pyDataProvider/pyDataProviderList";
+  config.set_files(dataFile);
+  EXPECT_EQ(config.IsInitialized(), true);
+#ifndef PADDLE_WITH_CUDA
+  bool useGpu = false;
+#else
+  bool useGpu = true;
+#endif
+  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
+  DataBatch dataBatch;
+  dataProvider->getNextBatchInternal(2, &dataBatch);
+  const std::vector<Argument>& argumentList = dataBatch.getStreams();
+  // Check size
+  EXPECT_EQ(argumentList.size(), 3UL);
+  EXPECT_EQ(argumentList[0].value->getWidth(), 3UL);
+  EXPECT_EQ(argumentList[0].value->getHeight(), 4UL);
+  EXPECT_EQ(argumentList[0].value->getElementCnt(), 12UL);
+  EXPECT_EQ(argumentList[1].value->getWidth(), 7UL);
+  EXPECT_EQ(argumentList[1].value->getHeight(), 4UL);
+  EXPECT_EQ(argumentList[1].value->getElementCnt(), 8UL);
+  EXPECT_EQ(argumentList[2].ids->getSize(), 4UL);
+  // Check value
+  simpleValueCheck(argumentList, useGpu);
+  // Check sequenceStartPositions
+  simpleSequenceCheck(argumentList, 4);
+  // Check subSequenceStartPositions
+  EXPECT_EQ(argumentList[0].subSequenceStartPositions->getSize(), 4UL);
+  EXPECT_EQ(argumentList[1].subSequenceStartPositions->getSize(), 3UL);
EXPECT_EQ(argumentList[2].subSequenceStartPositions->getSize(), 4UL); + for (size_t i = 0; i < argumentList.size(); i++) { + EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(0), 0); + EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(1), 1); + if (i != 1) { + EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 2); + EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(3), 4); + } else { + EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 4); + } + } +} + +void simpleValueCheck(const vector& argumentList, bool useGpu) { + // Dense + real* data; + if (useGpu) { + MatrixPtr cpuMatrixPtr = Matrix::create(argumentList[0].value->getHeight(), + argumentList[0].value->getWidth(), + 0, + 0); + cpuMatrixPtr->copyFrom(*argumentList[0].value); + data = cpuMatrixPtr->getData(); + } else { + data = argumentList[0].value->getData(); + } + for (size_t i = 0; i < argumentList[0].value->getElementCnt(); ++i) { + EXPECT_EQ(*(data + i), (float)(i % 3 + 1)); + } + // Sparse without value + GpuSparseMatrixPtr matGpu; + CpuSparseMatrixPtr matCpu; + if (useGpu) { + matGpu = dynamic_pointer_cast(argumentList[1].value); + ASSERT_TRUE(matGpu != NULL); + } else { + data = argumentList[0].value->getData(); + matCpu = dynamic_pointer_cast(argumentList[1].value); + ASSERT_TRUE(matCpu != NULL); + } + for (size_t i = 0; i < argumentList[1].value->getHeight(); ++i) { + size_t colNum = useGpu ? matGpu->getColNum(i) : matCpu->getColNum(i); + EXPECT_EQ(colNum, (size_t)2); + const int* buf = useGpu ? matGpu->getRowCols(i) : matCpu->getRowCols(i); + for (size_t j = 0; j < colNum; ++j) { + EXPECT_EQ((size_t)buf[j], (size_t)(j + 1)); + } + } + // Index + for (size_t j = 0; j < argumentList[2].ids->getSize(); ++j) { + EXPECT_EQ((size_t)argumentList[2].ids->get(j), 0UL); + } +} + +void simpleSequenceCheck(const vector& argumentList, int sample_num) { + EXPECT_EQ(argumentList[0].sequenceStartPositions->getSize(), 3UL); + EXPECT_EQ(argumentList[1].sequenceStartPositions->getSize(), 2UL); + EXPECT_EQ(argumentList[2].sequenceStartPositions->getSize(), 3UL); + for (size_t i = 0; i < argumentList.size(); i++) { + EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(0), 0); + if (i != 1) { + EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), 1); + EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(2), + sample_num); + } else { + EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), + sample_num); + } + } +} + +int main(int argc, char** argv) { + initMain(argc, argv); + initPython(argc, argv); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp b/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7f5a087b9abb967bf40022c079efe1fdf4bfb221 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp @@ -0,0 +1,409 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_NO_PYTHON
+#include <fstream>
+#include <gtest/gtest.h>
+#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
+#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Util.h"
+
+DEFINE_string(train_list, "unittest.list", "file list for unittest");
+
+namespace paddle {
+namespace unittest {
+namespace pydp2 {
+extern void setOnPoolFilledHook(const std::function<void(size_t)> &func);
+extern void clearOnPoolFilledHook();
+
+}  // namespace pydp2
+}  // namespace unittest
+}  // namespace paddle
+
+const paddle::real epsilon = 1e-5;
+
+static inline int64_t readDataBatch(paddle::DataBatch *batch,
+                                    const std::string &funcName,
+                                    int64_t batchSize = 65535) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object(funcName);
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->setSkipShuffle();
+  provider->reset();
+  return provider->getNextBatchInternal(batchSize, batch);
+}
+
+TEST(PyDataProvider2, dense_no_seq) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_dense_no_seq");
+
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+
+  provider->setSkipShuffle();  // skip shuffle for unittest.
+
+  paddle::DataBatch batch;
+  for (size_t pass = 0; pass < 2; ++pass) {  // read 2 passes
+    provider->reset();
+    int64_t num = provider->getNextBatchInternal(100, &batch);
+    ASSERT_NE(num, 0);
+    ASSERT_EQ((size_t)batch.getStreams().size(), (size_t)1);
+    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
+    // Check batch data.
+    for (size_t i = 0; i < 100; ++i) {
+      for (size_t j = 0; j < 200; ++j) {
+        paddle::real tmp = (paddle::real)((j - 100.0) * (i + 1) / 200.0);
+        ASSERT_NEAR(
+            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
+      }
+    }
+
+    num = provider->getNextBatchInternal(100, &batch);
+    ASSERT_NE(num, 0);
+    ASSERT_EQ(batch.getStreams().size(), (size_t)1);
+    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
+    // Check batch data.
+    for (size_t i = 0; i < 100; ++i) {
+      size_t ii = i + 100;
+      for (size_t j = 0; j < 200; ++j) {
+        paddle::real tmp = (paddle::real)((j - 100.0) * (ii + 1) / 200.0);
+        ASSERT_NEAR(
+            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
+      }
+    }
+    num = provider->getNextBatchInternal(100, &batch);
+    ASSERT_EQ(num, 0);
+  }
+}
+
+TEST(PyDataProvider2, index_no_seq) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_index_no_seq");
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+
+  provider->setSkipShuffle();  // skip shuffle for unittest.
+  paddle::DataBatch batch;
+  for (size_t pass = 0; pass < 2; ++pass) {
+    provider->reset();
+    int64_t num = provider->getNextBatchInternal(10000, &batch);
+    CHECK_EQ(num, 200);
+    for (int i = 0; i < 200; ++i) {
+      CHECK_EQ(i, batch.getStreams()[0].ids->getData()[i]);
+    }
+  }
+}
+
+TEST(PyDataProvider2, init_hook) {
+  paddle::PyObjectPtr pickle = paddle::py::import("pickle");
+  paddle::PyObjectPtr globals(PyModule_GetDict(PyImport_AddModule("__main__")));
+  PyDict_SetItemString(globals.get(), "pickle", pickle.get());
+  paddle::PyObjectPtr locals(PyDict_New());
+  paddle::PyObjectPtr mdl(PyRun_String(
+      "dumps = pickle.dumps({'value':[float(x) for x in xrange(20)]})",
+      Py_file_input,
+      globals.get(),
+      locals.get()));
+  CHECK_PY(mdl) << "Error!";
+  paddle::PyObjectPtr dps(PyDict_GetItemString(locals.get(), "dumps"));
+  CHECK_PY(dps) << "Error!";
+
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_init_hook");
+  config.set_load_data_args(PyString_AsString(dps.get()));
+
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->setSkipShuffle();  // skip shuffle for unittest.
+  provider->reset();
+  paddle::DataBatch batch;
+  int64_t num = provider->getNextBatchInternal(100000, &batch);
+  ASSERT_EQ(num, 200);
+  auto &mat = batch.getStreams()[0].value;
+  ASSERT_EQ((size_t)mat->getWidth(), (size_t)20);
+  for (size_t i = 0; i < 200; ++i) {
+    for (size_t j = 0; j < 20; ++j) {
+      ASSERT_NEAR((paddle::real)j, mat->getData()[i * 20 + j], epsilon);
+    }
+  }
+}
+
+TEST(PyDataProvider2, sparse_no_value_no_seq) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_sparse_non_value_no_seq");
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->setSkipShuffle();
+  provider->reset();
+  paddle::DataBatch batch;
+  int64_t num = provider->getNextBatchInternal(10000, &batch);
+  CHECK_EQ(num, 200);
+  auto csm = std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(
+      batch.getStreams()[0].value);
+  CHECK(csm != nullptr);
+  for (int i = 0; i < 200; ++i) {
+    CHECK_EQ(csm->getColNum(i), (size_t)10);
+    int *cols = csm->getRowCols(i);
+    for (int j = 0; j < 10; ++j) {
+      CHECK_EQ(cols[j], (i + 1) * (j + 1));
+    }
+  }
+}
+
+TEST(PyDataProvider2, sparse_value_no_seq) {
+  paddle::DataBatch batch;
+  CHECK_EQ(readDataBatch(&batch, "test_sparse_value_no_seq"), 200);
+  auto csm = std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(
+      batch.getStreams()[0].value);
+  CHECK(csm != nullptr);
+  for (int i = 0; i < 200; ++i) {
+    CHECK_EQ(csm->getColNum(i), (size_t)10);
+    int *cols = csm->getRowCols(i);
+    real *dat = csm->getRowValues(i);
+    for (int j = 0; j < 10; ++j) {
+      EXPECT_EQ(cols[j], (i + 1) * (j + 1));
+      EXPECT_EQ(dat[j], real(j) / real(i + 1));
+    }
+  }
+}
+
+TEST(PyDataProvider2, index_seq) {
+  paddle::DataBatch batch;
+  CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200);
+  auto &arg = batch.getStreams()[0];
+  CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2);
+  size_t tmp = 0;
+  for (size_t i = 0; i < 200; ++i) {  // CHECK DATA CORRECT
+    for (size_t j = 0; j < i + 1; ++j) {
+      ASSERT_EQ((size_t)arg.ids->getData()[tmp], j);
+      ++tmp;
+    }
+  }
+  ASSERT_EQ(arg.sequenceStartPositions->getSize(), (size_t)201);
+  tmp = 0;
+  for (size_t i = 0; i < 200; ++i) {
+    tmp += i;
+    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i], tmp);
+  }
+  tmp += 200;
+  ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[200], tmp);
+}
+
+TEST(PyDataProvider2, index_sub_seq) {
+  paddle::DataBatch batch;
+  ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200);
+  auto &arg = batch.getStreams()[0];
+  size_t tmp = 0;
+  for (size_t i = 0; i < 200; ++i) {
+    for (size_t j = 0; j < i + 1; ++j) {
+      for (size_t k = 0; k < j + 1; ++k) {
+        CHECK_EQ((size_t)arg.ids->getData()[tmp++], k);
+      }
+    }
+  }
+
+  CHECK_EQ(tmp, arg.ids->getSize());
+
+  ASSERT_EQ((size_t)arg.sequenceStartPositions->getSize(), (size_t)201);
+  ASSERT_EQ(arg.subSequenceStartPositions->getData(false)[0], 0);
+  ASSERT_EQ(arg.sequenceStartPositions->getData(false)[0], 0);
+  size_t idx = 1;
+  tmp = 0;
+  for (size_t i = 0; i < 200; ++i) {
+    for (size_t j = 0; j < i + 1; ++j) {
+      tmp += j + 1;
+      ASSERT_EQ((size_t)arg.subSequenceStartPositions->getData(false)[idx],
+                (size_t)tmp);
+      ++idx;
+    }
+    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i + 1], tmp);
+  }
+}
+
+TEST(PyDataProvider2, min_pool_size) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_min_pool_size");
+  config.set_load_data_args("");
+  size_t totalData = 1 << 14;
+  constexpr size_t batchSize = 100;
+  constexpr size_t minPoolSize = 1000;
+  paddle::DataBatch batch;
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->reset();
+
+  paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) {
+    if (totalData > batchSize) {
+      CHECK_GE(poolSize, std::min(totalData - batchSize, minPoolSize));
+    }
+  });
+  while (true) {
+    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
+    if (realBatchSize) {
+      totalData -= realBatchSize;
+    } else {
+      break;
+    }
+  }
+  paddle::unittest::pydp2::clearOnPoolFilledHook();
+}
+
+TEST(PyDataProvider2, can_over_batch_size) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_can_over_batch_size");
+  config.set_load_data_args("");
+  paddle::DataBatch batch;
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->reset();
+  constexpr size_t batchSize = 100;
+  while (true) {
+    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
+    if (realBatchSize) {
+      CHECK_LE(static_cast<size_t>(realBatchSize), batchSize);
+    } else {
+      break;
+    }
+  }
+}
+
+TEST(PyDataProvider2, input_order) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_input_order");
+  config.set_load_data_args("");
+
+  paddle::ModelConfig modelConfig;
+  *modelConfig.add_input_layer_names() = "input1";
+  *modelConfig.add_input_layer_names() = "input2";
+  paddle::DataBatch batch;
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, modelConfig, false));
+  provider->reset();
+  constexpr size_t batchSize = 100;
+  while (true) {
+    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
+    if (!realBatchSize) {
+      break;
+    }
+    ASSERT_EQ(batch.getStreams().size(), static_cast<size_t>(2));
+    for (int64_t i = 0; i < realBatchSize; ++i) {
+      ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0);
+      ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1);
+    }
+  }
+}
+
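+// All tests in this file follow the pattern exercised above: build a
+// DataConfig for the "py2" provider type, create the provider, then drain
+// it with getNextBatchInternal(). A minimal sketch of that pattern, using
+// only interfaces already called in this file (the module/object names
+// are the same stubs the tests use):
+//
+//   paddle::DataConfig config;
+//   config.set_type("py2");
+//   config.set_files(FLAGS_train_list.c_str());
+//   config.set_load_data_module("test_PyDataProvider2");
+//   config.set_load_data_object("test_dense_no_seq");
+//   std::unique_ptr<paddle::DataProvider> provider(
+//       paddle::DataProvider::create(config, /*useGpu=*/false));
+//   provider->reset();
+//   paddle::DataBatch batch;
+//   while (int64_t num = provider->getNextBatchInternal(100, &batch)) {
+//     // consume `num` samples from batch.getStreams()
+//   }
+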
+TEST(PyDataProvider2, test_check) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_check"); + config.set_load_data_args(""); + paddle::DataBatch batch; + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + while (true) { + int64_t realBatchSize = provider->getNextBatchInternal(100, &batch); + if (!realBatchSize) { + break; + } else { + auto &ivec = batch.getStream(0).ids; + for (size_t i = 0; i < ivec->getSize(); ++i) { + CHECK_LT(ivec->getData()[i], 10); + } + } + } +} + +TEST(PyDataProvider2, multiThread) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_dense_no_seq"); + config.set_async_load_data(true); + + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + paddle::DataBatch batch; + provider->getNextBatch(100, &batch); + provider->reset(); + provider.reset(); +} + +TEST(PyDataProvider2, minPoolSizeWithCache) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_min_pool_size_with_cache"); + config.set_async_load_data(true); + + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + + paddle::DataBatch batch; + + for (int i = 0; i < 10; ++i) { + provider->reset(); + int64_t sum = 0; + while (int64_t actualNum = provider->getNextBatch(100, &batch)) { + sum += actualNum; + } + ASSERT_EQ(1 << 20, sum); + } +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + paddle::initMain(argc, argv); + paddle::initPython(argc, argv); + + std::ofstream fout(FLAGS_train_list); + CHECK(fout.is_open()); + fout << "stub file name" << std::endl; // in unittest, filename is not used. + fout.close(); + + return RUN_ALL_TESTS(); +} + +#endif diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/legacy/gserver/tests/test_PyDataProvider2.py similarity index 100% rename from paddle/gserver/tests/test_PyDataProvider2.py rename to paddle/legacy/gserver/tests/test_PyDataProvider2.py diff --git a/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9f9fee7ef6c83d3bb53bf5725dd6a3725d6c7c93 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +DECLARE_int32(seed); + +using namespace paddle; // NOLINT +using namespace std; // NOLINT +class TrainerForTest : public paddle::Trainer { + public: + void startTrain() { + GradientMachine& gm = *this->trainerInternal_.getGradientMachine(); + gm.start(); + } + + void finishTrain() { + GradientMachine& gm = *this->trainerInternal_.getGradientMachine(); + gm.finish(); + } + + /** + * Get total dimension of all parameters. + * + * @return the total dimension of all parameters + */ + size_t getTotalParameterSize() const { + auto p = const_cast(this); + auto& params = p->getGradientMachine()->getParameters(); + return std::accumulate( + params.begin(), params.end(), 0UL, [](size_t a, const ParameterPtr& p) { + return a + p->getSize(); + }); + } +}; + +void CalCost(const string& conf, + const string& dir, + real* cost, + int num_passes) { + auto config = std::make_shared(conf); + TrainerForTest trainer; + trainer.init(config); + mkDir(dir.c_str()); + config->setSaveDir(dir); + auto dataProvider = trainer.getDataProvider(); + int32_t batchSize = config->getOptConfig().batch_size(); + real learningRate = config->getOptConfig().learning_rate(); + real momentum = 0; + real decayRate = 0; + int64_t dim = trainer.getTotalParameterSize(); + CpuVector vecW(dim); + CpuVector vecGradient(dim); + CpuVector vecMomentum(dim); + + // vecW needs to be assigned, otherwise the variable is an uncertain value. + + *ThreadLocalRand::getSeed() = FLAGS_seed; + vecW.randnorm(0, 0.1); + vecMomentum.randnorm(0, 0.1); + + trainer.startTrain(); + for (int i = 0; i < num_passes; ++i) { + real totalCost = 0; + dataProvider->reset(); + while (true) { + DataBatch dataBatch; + int num = dataProvider->getNextBatch(batchSize, &dataBatch); + if (num == 0) break; + totalCost += trainer.calcGradient(dataBatch, vecW, vecGradient); + sgdUpdate( + learningRate, momentum, decayRate, &vecW, &vecGradient, &vecMomentum); + } + cost[i] = totalCost; + } + trainer.finishTrain(); + rmDir(dir.c_str()); +} + +void test(const string& conf1, const string& conf2, double eps, bool useGpu) { + if (!paddle::version::isWithGpu() && useGpu) { + return; + } + FLAGS_use_gpu = useGpu; + int num_passes = 5; + real* cost1 = new real[num_passes]; + const string dir1 = "legacy/gserver/tests/t1"; + CalCost(conf1, dir1, cost1, num_passes); + + real* cost2 = new real[num_passes]; + const string dir2 = "legacy/gserver/tests/t2"; + CalCost(conf2, dir2, cost2, num_passes); + + for (int i = 0; i < num_passes; i++) { + LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i] + << ", cost2=" << cost2[i] + << ", diff=" << std::abs(cost1[i] - cost2[i]); + ASSERT_NEAR(cost1[i], cost2[i], eps); + } + delete[] cost1; + delete[] cost2; +} + +TEST(RecurrentGradientMachine, HasSubSequence) { + for (bool useGpu : {false, true}) { + test("legacy/gserver/tests/sequence_layer_group.conf", + "legacy/gserver/tests/sequence_nest_layer_group.conf", + 1e-5, + useGpu); + } +} + +TEST(RecurrentGradientMachine, rnn) { + for (bool useGpu : {false, true}) { + test("legacy/gserver/tests/sequence_rnn.conf", + "legacy/gserver/tests/sequence_nest_rnn.conf", + 1e-6, + useGpu); + } +} + +TEST(RecurrentGradientMachine, rnn_multi_input) { + for (bool useGpu : {false, true}) { + test("legacy/gserver/tests/sequence_rnn_multi_input.conf", + "legacy/gserver/tests/sequence_nest_rnn_multi_input.conf", + 1e-6, + useGpu); + } +} + +TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) { + for (bool useGpu : 
{false, true}) { + test("legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py", + "legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py", + 1e-6, + useGpu); + } +} + +TEST(RecurrentGradientMachine, rnn_mixed_input) { + for (bool useGpu : {false, true}) { + test("legacy/gserver/tests/sequence_rnn_mixed_inputs.py", + "legacy/gserver/tests/sequence_rnn_matched_inputs.py", + 1e-6, + useGpu); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + if (paddle::version::isWithPyDataProvider()) { + if (!paddle::version::isWithGpu()) { + FLAGS_use_gpu = false; + } + initMain(argc, argv); + initPython(argc, argv); + return RUN_ALL_TESTS(); + } else { + return 0; + } +} diff --git a/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp b/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..852a08d49343766f9e5a2c4ff8318586262ca1a2 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp @@ -0,0 +1,571 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/gserver/layers/Layer.h" + +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT +DECLARE_bool(use_gpu); +DECLARE_bool(rnn_use_batch); +DECLARE_int32(fixed_seq_length); + +void checkError(const Matrix& matrix1, const Matrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { + count++; + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void checkError(const CpuVector& vector1, const CpuVector& vector2) { + CHECK(vector1.getSize() == vector2.getSize()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int size = vector1.getSize(); + const real* data1 = vector1.getData(); + const real* data2 = vector2.getData(); + int count = 0; + for (int i = 0; i < size; i++) { + if (fabs(data1[i] - data2[i]) > err) { + count++; + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +LayerPtr creatDataLayer(string name, + size_t batchSize, + int layerSize, + bool useGpu) { + LayerConfig dataConfig; + dataConfig.set_name(name); + dataConfig.set_type("data"); + dataConfig.set_size(layerSize); + LayerPtr layer = LayerPtr(new DataLayer(dataConfig)); + + Argument data; + data.value = Matrix::create(batchSize, layer->getSize(), false, useGpu); + data.grad = 
Matrix::create(batchSize, layer->getSize(), false, useGpu); + data.value->randomizeUniform(); + data.value->add(-0.5); + data.value->sigmoid(*data.value); + data.grad->zeroMem(); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); + + DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); + dataLayer->setData(data); + dataLayer->forward(PASS_GC); + + return layer; +} + +ParameterPtr creatParameter(string name, + int pid, + size_t paraSize, + bool useGpu) { + ParameterConfig paraConfig; + paraConfig.set_name(name); + paraConfig.set_size(paraSize); + + ParameterPtr parameter = + std::make_shared(paraConfig, useGpu, /*initialize */ false); + parameter->enableType(PARAMETER_VALUE); + parameter->enableType(PARAMETER_GRADIENT); + parameter->randomize(); + parameter->setID(pid); + + return parameter; +} + +ParameterPtr creatParameterBias(string name, + int pid, + size_t paraSize, + bool useGpu) { + ParameterConfig paraConfig; + paraConfig.set_name(name); + paraConfig.set_size(paraSize); + paraConfig.set_initial_std(1); + + ParameterPtr parameter = + std::make_shared(paraConfig, useGpu, /*initialize */ true); + parameter->randomize(); + parameter->setID(pid); + + return parameter; +} + +LayerPtr initRecurrentLayer(LayerConfig layerConfig, + size_t batchSize, + int layerSize, + bool useGpu) { + FLAGS_use_gpu = useGpu; + LayerMap layerMap; + ParameterMap parameterMap; + LayerPtr dataLayer = creatDataLayer("layer_0", batchSize, layerSize, useGpu); + layerMap[dataLayer->getName()] = dataLayer; + + ParameterPtr para = + creatParameter("para_0", 0, layerSize * layerSize, useGpu); + parameterMap[para->getName()] = para; + + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name("layer_0"); + input.set_input_parameter_name("para_0"); + LayerPtr testLayer = Layer::create(layerConfig); + layerMap[testLayer->getName()] = testLayer; + + testLayer->init(layerMap, parameterMap); + testLayer->setNeedGradient(true); + + return testLayer; +} + +void checkRecurrentLayer(LayerPtr testLayer) { + const VectorPtr& weightGrad = + (testLayer->getParameters()[0])->getBuf(PARAMETER_GRADIENT); + const MatrixPtr& inputGrad = testLayer->getPrev(0)->getOutputGrad(); + CpuVector seqPara(weightGrad->getSize()); + CpuVector batPara(weightGrad->getSize()); + CpuMatrix seqInputGrad(inputGrad->getHeight(), inputGrad->getWidth()); + CpuMatrix batInputGrad(inputGrad->getHeight(), inputGrad->getWidth()); + + CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth()); + outputGrad.randomizeUniform(); + + /* use sequence calculate */ + FLAGS_rnn_use_batch = false; + weightGrad->zero(); + inputGrad->zero(); + testLayer->forward(PASS_GC); + testLayer->getOutputGrad()->copyFrom(outputGrad); + testLayer->backward(); + seqPara.copyFrom(*weightGrad); + seqInputGrad.copyFrom(*inputGrad); + + /* use batch calculate */ + FLAGS_rnn_use_batch = true; + weightGrad->zero(); + inputGrad->zero(); + testLayer->forward(PASS_GC); + testLayer->getOutputGrad()->copyFrom(outputGrad); + testLayer->backward(); + batPara.copyFrom(*weightGrad); + batInputGrad.copyFrom(*inputGrad); + + /* check */ + checkError(seqInputGrad, batInputGrad); + checkError(seqPara, batPara); +} + +TEST(Layer, RecurrentLayer) { + LayerConfig layerConfig; + layerConfig.set_name("rnn"); + layerConfig.set_type("recurrent"); + layerConfig.set_active_type("tanh"); + for (auto layerSize : {1, 10, 64, 128, 256, 512}) { + for (auto batchSize : {1, 5, 20, 100, 128}) { + for (auto useGpu : {false, 
true}) {
+        for (auto reversed : {false, true}) {
+          LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize
+                    << " useGpu=" << useGpu << " reversed=" << reversed;
+          layerConfig.set_size(layerSize);
+          layerConfig.set_reversed(reversed);
+          LayerPtr testLayer =
+              initRecurrentLayer(layerConfig, batchSize, layerSize, useGpu);
+          checkRecurrentLayer(testLayer);
+        }
+      }
+    }
+  }
+}
+
+#define protected public
+#include "paddle/legacy/gserver/layers/GatedRecurrentLayer.h"
+#include "paddle/legacy/gserver/layers/LstmLayer.h"
+#include "paddle/legacy/gserver/layers/RecurrentLayer.h"
+template <class T>
+class TestRecurrentLayer {
+ public:
+  LayerConfig config_;
+  bool useGpu_;
+  bool useBatch_;
+  LayerPtr testLayer_;
+  LayerPtr dataLayer_;
+  ParameterPtr para_;
+  ParameterPtr bias_;
+  LayerMap layerMap_;
+  ParameterMap parameterMap_;
+  TestRecurrentLayer(const LayerConfig& config,
+                     bool useGpu,
+                     bool useBatch = false)
+      : config_(config), useGpu_(useGpu), useBatch_(useBatch) {}
+  void init(size_t batchSize) {
+    FLAGS_use_gpu = useGpu_;
+    testLayer_ = Layer::create(config_);
+    if (typeid(T) == typeid(GatedRecurrentLayer)) {
+      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
+                                  batchSize,
+                                  config_.size() * 3,
+                                  useGpu_);
+      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
+                             0,
+                             config_.size() * config_.size() * 3,
+                             useGpu_);
+      bias_ = creatParameterBias(
+          config_.bias_parameter_name(), 1, config_.size() * 3, useGpu_);
+    } else if (typeid(T) == typeid(LstmLayer)) {
+      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
+                                  batchSize,
+                                  config_.size() * 4,
+                                  useGpu_);
+      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
+                             0,
+                             config_.size() * config_.size() * 4,
+                             useGpu_);
+      bias_ = creatParameterBias(
+          config_.bias_parameter_name(), 1, config_.size() * 7, useGpu_);
+    }
+    layerMap_[dataLayer_->getName()] = dataLayer_;
+    parameterMap_[para_->getName()] = para_;
+    parameterMap_[bias_->getName()] = bias_;
+
+    layerMap_[testLayer_->getName()] = testLayer_;
+    testLayer_->init(layerMap_, parameterMap_);
+    testLayer_->setNeedGradient(true);
+    (dynamic_cast<T*>(testLayer_.get()))->useBatch_ = useBatch_;
+  }
+  void forward() {
+    FLAGS_use_gpu = useGpu_;
+    testLayer_->forward(PASS_GC);
+  }
+  void backward() {
+    FLAGS_use_gpu = useGpu_;
+    testLayer_->backward(nullptr);
+  }
+};
+
+template <class T>
+void checkRecurrentLayer(LayerConfig layerConfig,
+                         size_t batchSize,
+                         bool cpuBatch,
+                         bool gpuBatch) {
+  TestRecurrentLayer<T> testCpu(layerConfig, false, cpuBatch);
+  TestRecurrentLayer<T> testGpu(layerConfig, true, gpuBatch);
+  testCpu.init(batchSize);
+  testGpu.init(batchSize);
+  auto checkError = [](
+      MatrixPtr cpu, MatrixPtr gpu, int numSequences, const char* str) {
+    CpuMatrix check(gpu->getHeight(), gpu->getWidth());
+    check.copyFrom(*gpu);
+    int height = cpu->getHeight();
+    int width = cpu->getWidth();
+    const real* data1 = cpu->getData();
+    const real* data2 = check.getData();
+    int count = 0;
+    for (int i = 0; i < height; i++) {
+      for (int j = 0; j < width; j++) {
+        if (fabs(data1[i * width + j] - data2[i * width + j]) / numSequences >
+            1e-4) {
+          count++;
+        }
+      }
+    }
+    EXPECT_EQ(count, 0) << "[" << str << "]"
+                        << "There are " << count << " different elements.";
+  };
+  T* cpuLayer = dynamic_cast<T*>(testCpu.testLayer_.get());
+  T* gpuLayer = dynamic_cast<T*>(testGpu.testLayer_.get());
+
+  Argument& cpuInput = testCpu.dataLayer_->getOutput();
+  Argument& gpuInput = testGpu.dataLayer_->getOutput();
+
gpuInput.resizeAndCopyFrom(cpuInput, true); + + const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE); + const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE); + gpuVec->copyFrom(*cpuVec); + + const VectorPtr& cpuBiasVec = testCpu.bias_->getBuf(PARAMETER_VALUE); + const VectorPtr& gpuBiasVec = testGpu.bias_->getBuf(PARAMETER_VALUE); + gpuBiasVec->copyFrom(*cpuBiasVec); + + /* check forward */ + testCpu.forward(); + testGpu.forward(); + + checkError( + cpuLayer->getOutputValue(), gpuLayer->getOutputValue(), 1, "outputValue"); + + /* check backward */ + cpuLayer->getOutputGrad()->randomizeUniform(); + gpuLayer->getOutputGrad()->copyFrom(*cpuLayer->getOutputGrad()); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + + testCpu.backward(); + testGpu.backward(); + + // check input grad + checkError(cpuInput.grad, gpuInput.grad, 1, "inputGrad"); + // check weight grad + int numSequences = cpuInput.getNumSequences(); + checkError(cpuLayer->weight_->getWGrad(), + gpuLayer->weight_->getWGrad(), + numSequences, + "weightGrad"); + // check bias grad + checkError(cpuLayer->bias_->getWGrad(), + gpuLayer->bias_->getWGrad(), + numSequences, + "biasGrad"); +} + +TEST(Layer, GatedRecurrentLayer) { + LayerConfig layerConfig; + layerConfig.set_type("gated_recurrent"); + layerConfig.set_active_type("sigmoid"); + layerConfig.set_active_gate_type("sigmoid"); + + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name("layer_0"); + input.set_input_parameter_name("para_0"); + layerConfig.set_bias_parameter_name("bias"); + + for (auto frameSize : {32, 64, 128, 256, 512}) { + for (auto batchSize : {1, 5, 100, 500}) { + for (auto reversed : {false, true}) { + for (auto cpuBatch : {false, true}) { + for (auto gpuBatch : {false, true}) { + LOG(INFO) << " batchSize=" << batchSize + << " frameSize=" << frameSize << " reversed=" << reversed + << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch; + layerConfig.set_size(frameSize); + layerConfig.set_reversed(reversed); + checkRecurrentLayer( + layerConfig, batchSize, cpuBatch, gpuBatch); + } + } + } + } + } +} + +TEST(Layer, LstmLayer) { + LayerConfig layerConfig; + layerConfig.set_type("lstmemory"); + layerConfig.set_active_type("relu"); + layerConfig.set_active_state_type("tanh"); + layerConfig.set_active_gate_type("sigmoid"); + + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name("layer_0"); + input.set_input_parameter_name("para_0"); + layerConfig.set_bias_parameter_name("bias"); + + for (auto frameSize : {32, 64, 128, 256, 512}) { + for (auto batchSize : {1, 5, 100, 500}) { + for (auto reversed : {false, true}) { + for (auto cpuBatch : {false, true}) { + for (auto gpuBatch : {false, true}) { + LOG(INFO) << " batchSize=" << batchSize + << " frameSize=" << frameSize << " reversed=" << reversed + << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch; + layerConfig.set_size(frameSize); + layerConfig.set_reversed(reversed); + checkRecurrentLayer( + layerConfig, batchSize, cpuBatch, gpuBatch); + } + } + } + } + } +} + +#ifdef PADDLE_WITH_MKLML + +#include "paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h" + +LayerPtr initMKLPackedLayer(LayerConfig layerConfig, + bool reversed, + int layerSize, + LayerPtr dataLayer, + ParameterPtr para, + ParameterPtr bias = nullptr) { + LayerMap layerMap; + ParameterMap parameterMap; + layerMap[dataLayer->getName()] = dataLayer; + parameterMap[para->getName()] = para; + if (bias) { + 
parameterMap[bias->getName()] = bias; + layerConfig.set_bias_parameter_name("bias_0"); + } + + layerConfig.set_size(layerSize); + layerConfig.set_reversed(reversed); + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name("layer_0"); + input.set_input_parameter_name("para_0"); + + LayerPtr testLayer = Layer::create(layerConfig); + layerMap[testLayer->getName()] = testLayer; + + testLayer->init(layerMap, parameterMap); + testLayer->setNeedGradient(true); + + return testLayer; +} + +void checkMKLPackedLayer(LayerConfig layerConfig1, + LayerConfig layerConfig2, + bool reversed, + int layerSize, + int batchSize, + bool useBatch1, + bool useBatch2) { + LayerPtr dataLayer; + ParameterPtr para, bias; + + if (layerConfig1.type() == "recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false); + para = creatParameter("para_0", 0, layerSize * layerSize, false); + bias = nullptr; + } else if (layerConfig1.type() == "gated_recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false); + para = creatParameter("para_0", 0, layerSize * layerSize * 3, false); + bias = creatParameterBias("bias_0", 1, layerSize * 3, false); + } + + LayerPtr testLayer1 = initMKLPackedLayer( + layerConfig1, reversed, layerSize, dataLayer, para, bias); + LayerPtr testLayer2 = initMKLPackedLayer( + layerConfig2, reversed, layerSize, dataLayer, para, bias); + + const VectorPtr& weightGrad = + (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); + const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); + CpuVector wgt_grad1(weightGrad->getSize()); + CpuVector wgt_grad2(weightGrad->getSize()); + CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); + CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); + + for (int i = 0; i < 2; i++) { + FLAGS_rnn_use_batch = useBatch1; + + testLayer1->forward(PASS_GC); + + FLAGS_rnn_use_batch = useBatch2; + testLayer2->forward(PASS_GC); + + testLayer1->getOutputGrad()->randomizeUniform(); + testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad()); + + weightGrad->zero(); + inputGrad->zero(); + FLAGS_rnn_use_batch = useBatch1; + testLayer1->backward(nullptr); + + wgt_grad1.copyFrom(*weightGrad); + input_grad1.copyFrom(*inputGrad); + + weightGrad->zero(); + inputGrad->zero(); + FLAGS_rnn_use_batch = useBatch2; + testLayer2->backward(nullptr); + + wgt_grad2.copyFrom(*weightGrad); + input_grad2.copyFrom(*inputGrad); + + checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); + checkError(wgt_grad1, wgt_grad2); + checkError(input_grad1, input_grad2); + } +} + +TEST(MKLPackedLayer, RecurrentLayer) { + LayerConfig layerConfig1; + LayerConfig layerConfig2; + + layerConfig1.set_name("paddle-rnn"); + layerConfig1.set_type("recurrent"); + layerConfig1.set_active_type("relu"); + + layerConfig2.set_name("mkl-packed-rnn"); + layerConfig2.set_type("mkl_packed_recurrent"); + layerConfig2.set_active_type("relu"); + + FLAGS_use_gpu = false; + + for (auto layerSize : {32, 64, 128, 256, 512}) { + for (auto batchSize : {1, 5, 100, 500}) { + for (auto reversed : {true, false}) { + for (auto paddle_use_batch : {true, false}) { + for (auto MKLPacked_use_batch : {true, false}) { + LOG(INFO) << " layerSize=" << layerSize + << " batchSize=" << batchSize << " reversed=" << reversed + << " paddle_use_batch=" << paddle_use_batch + << " MKLPacked_use_batch=" << MKLPacked_use_batch; + + checkMKLPackedLayer(layerConfig1, + layerConfig2, 
+ reversed, + layerSize, + batchSize, + paddle_use_batch, + MKLPacked_use_batch); + } + } + } + } + } +} +#endif + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + if (!version::isWithGpu()) { + testing::GTEST_FLAG(filter) = "-Layer.*"; + } + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..160d95f15833ece4a59f7a0a912593938ad92218 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp @@ -0,0 +1,471 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/gserver/layers/FullyConnectedLayer.h" +#include "paddle/legacy/gserver/layers/Layer.h" +#include "paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h" +#include "paddle/legacy/math/CpuSparseMatrix.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(num_passes); +DECLARE_string(config); +DECLARE_string(init_model_path); +DECLARE_string(config_args); + +size_t fcLayerWidth = 1024; + +struct ComData { + vector outArgs; + vector parameters; +}; + +int randint(int* data, size_t int_max, size_t size) { + srand((size_t)(time(NULL))); + if (int_max < size) { + return -1; + } + size_t count = 0; + std::map tmp; + int this_int = 0; + + while (count < size) { + this_int = std::rand() % int_max; // NOLINT + if (tmp.find(this_int) == tmp.end()) { + tmp[this_int] = 0; + count += 1; + } + } + + if (tmp.size() != size) { + return -1; + } + count = 0; + for (auto itr = tmp.begin(); itr != tmp.end(); ++itr) { + data[count] = itr->first; + count += 1; + } + return 0; +} + +void calcOutput(ComData& comData, + const string configFile, + const string configArgs, + bool useGpu) { + FLAGS_config = configFile; + FLAGS_config_args = configArgs; + FLAGS_use_gpu = useGpu; + FLAGS_init_model_path = "legacy/gserver/tests/SelectiveFcTest/model"; + *ThreadLocalRand::getSeed() = 0; + srand(0); + + Trainer trainer; + trainer.init(TrainerConfigHelper::createFromFlags(), false); + + comData.parameters = trainer.getGradientMachine()->getParameters(); + + auto dataProvider = trainer.getDataProvider(); + int32_t batchSize = trainer.getConfig().opt_config().batch_size(); + DataBatch dataBatch; + dataProvider->setSkipShuffle(); + dataProvider->reset(); + dataProvider->getNextBatch(batchSize, &dataBatch); + CHECK(dataBatch.getSize()) << "No data from data provider"; + + vector& inArgs = dataBatch.getStreams(); + trainer.getGradientMachine()->start(trainer.getConfig(), nullptr); + trainer.getGradientMachine()->forwardBackward( + inArgs, &comData.outArgs, PASS_TRAIN); + trainer.getGradientMachine()->finish(); +} + +void checkMatrix(real* A, real* B, size_t matSize) { 
+#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + int diffNum = 0; + for (size_t i = 0; i < matSize; ++i) { + if (std::isinf(A[i]) || std::isnan(A[i]) || std::isinf(B[i]) || + std::isnan(B[i])) { + } else if (fabs(A[i] - B[i]) > err) { + diffNum++; + } + } + EXPECT_EQ(0, diffNum); +} + +void checkTranspose(real* matrix, + real* transpose, + size_t width, + size_t matSize) { +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + size_t height = matSize / width; + int diffNum = 0; + size_t rowId = 0; + size_t colId = 0; + for (size_t i = 0; i < matSize; ++i) { + if (i % width == 0 && i) { + rowId++; + } + colId = i % width; + if (fabs(matrix[i] - transpose[colId * height + rowId]) > err) { + diffNum++; + LOG(INFO) << i << " diff : " << matrix[i] << "\t" + << transpose[colId * height + rowId]; + } + } + EXPECT_EQ(0, diffNum); +} + +void compareOutput(ComData& fcData, ComData& selFcData) { + vector outArgsFc = fcData.outArgs; + vector outArgsSelfc = selFcData.outArgs; + + // check cost + LOG(INFO) << "Check cost"; + CpuMatrix fcCost(outArgsFc[0].value->getHeight(), + outArgsFc[0].value->getWidth()); + CpuMatrix selfcCost(outArgsSelfc[0].value->getHeight(), + outArgsSelfc[0].value->getWidth()); + fcCost.copyFrom(*outArgsFc[0].value); + selfcCost.copyFrom(*outArgsSelfc[0].value); + checkMatrix(fcCost.getData(), selfcCost.getData(), fcCost.getElementCnt()); + + // check selective fc output and fc output + LOG(INFO) << "Compare output of SelectiveFullyConectedLayer " + << "with FullyConectedLayer"; + CpuMatrix fcOut(outArgsFc[1].value->getHeight(), + outArgsFc[1].value->getWidth()); + CpuMatrix selfcOut(outArgsSelfc[1].value->getHeight(), + outArgsSelfc[1].value->getWidth()); + + fcOut.copyFrom(*outArgsFc[1].value); + selfcOut.copyFrom(*outArgsSelfc[1].value); + checkMatrix(fcOut.getData(), selfcOut.getData(), fcOut.getElementCnt()); + + // check gradient math + vector& fcParam = fcData.parameters; + vector& selfcParam = selFcData.parameters; + for (size_t i = 0; i < fcParam.size(); ++i) { + ParameterPtr p1, p2; + p1 = fcParam[i]; + p2 = selfcParam[i]; + + string paramName = p1->getName(); + LOG(INFO) << "check parameter : " << paramName; + + // check parameter value + CpuVector paraValue1(p1->getSize()); + CpuVector paraValue2(p2->getSize()); + paraValue1.copyFrom(*p1->getBuf(PARAMETER_VALUE)); + paraValue2.copyFrom(*p2->getBuf(PARAMETER_VALUE)); + + // check gradient + CpuVector paraGrad1(*p1->getBuf(PARAMETER_GRADIENT)); + CpuVector paraGrad2(*p2->getBuf(PARAMETER_GRADIENT)); + if (paramName == "rand_fc_param.bias") { + checkMatrix( + paraValue1.getData(), paraValue2.getData(), paraValue1.getSize()); + checkMatrix( + paraGrad1.getData(), paraGrad2.getData(), paraGrad1.getSize()); + } else { + checkTranspose(paraValue1.getData(), + paraValue2.getData(), + fcLayerWidth, + paraValue1.getSize()); + checkTranspose(paraGrad1.getData(), + paraGrad2.getData(), + fcLayerWidth, + paraGrad1.getSize()); + } + } +} + +void compareSparseMulOutput( + real* fcOutput, + real* selOutput, + size_t nnz, + const std::shared_ptr>>& selCols) { +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + size_t nnzCount = + std::accumulate(selCols->begin(), + selCols->end(), + 0UL, + [](size_t a, const std::pair& arr) { + return a + arr.second; + }); + EXPECT_EQ(nnz, nnzCount); + + size_t sampleNum = selCols->size(); + int diffNum = 0; + size_t count = 0; + for (size_t i = 0; i < sampleNum; ++i) { + for (size_t j = 0; j < 
(*selCols)[i].second; ++j) { + size_t selIdx = (*selCols)[i].first[j]; + if (fabs(fcOutput[i * fcLayerWidth + selIdx] - selOutput[count]) > err) { + diffNum++; + LOG(INFO) << count << " diff : " << fcOutput[i * fcLayerWidth + selIdx] + << "\t" << selOutput[count]; + } + count++; + } + } + EXPECT_EQ(0, diffNum); +} + +LayerPtr creatDataLayer(string name, + size_t batchSize, + size_t layerSize, + std::vector& values, + bool useGpu) { + LayerConfig dataConfig; + dataConfig.set_name(name); + dataConfig.set_type("data"); + dataConfig.set_size(layerSize); + LayerPtr layer = LayerPtr(new DataLayer(dataConfig)); + + Argument data; + data.value = Matrix::create(batchSize, layerSize, false, useGpu); + data.value->copyFrom(values.data(), batchSize * layerSize); + + DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); + dataLayer->setData(data); + dataLayer->forward(PASS_TEST); + return layer; +} + +ParameterPtr creatParameter( + string name, int pid, size_t paraSize, string paramFile, bool useGpu) { + ParameterConfig paraConfig; + paraConfig.set_name(name); + paraConfig.set_size(paraSize); + + ParameterPtr parameter = + std::make_shared(paraConfig, useGpu, /*initialize */ false); + parameter->enableType(PARAMETER_VALUE); + parameter->randomize(); + parameter->setID(pid); + parameter->load(paramFile); + return parameter; +} + +LayerPtr initFcLayer(LayerPtr dataLayer, + LayerConfig layerConfig, + int dataLayerSize, + int fcLayerSize, + string paraName, + string paraFile, + bool useGpu) { + LayerMap layerMap; + ParameterMap parameterMap; + + layerMap[dataLayer->getName()] = dataLayer; + ParameterPtr para = creatParameter( + paraName, 0, dataLayerSize * fcLayerSize, paraFile, useGpu); + parameterMap[para->getName()] = para; + + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name(dataLayer->getName()); + input.set_input_parameter_name(paraName); + + LayerPtr testLayer = Layer::create(layerConfig); + layerMap[testLayer->getName()] = testLayer; + + testLayer->setNeedGradient(false); + testLayer->init(layerMap, parameterMap); + return testLayer; +} + +#ifndef PADDLE_TYPE_DOUBLE +// The parameter file used in fc.conf and selective_fc.conf is float +TEST(Layer, SelectiveFcLayer_train_dense_mul) { + const string& fcConfig = "legacy/gserver/tests/SelectiveFcTest/conf/fc.conf"; + const string& fcConfigArgs = + "filelist=legacy/gserver/tests/SelectiveFcTest/dense_mul_list"; + const string& selFcConfig = + "legacy/gserver/tests/SelectiveFcTest/conf/selective_fc.conf"; + const string& selConfigArgs = + "filelist=legacy/gserver/tests/SelectiveFcTest/dense_mul_list"; + + for (auto useGpu : {false, true}) { +#ifndef PADDLE_WITH_CUDA + if (useGpu) { + break; + } +#endif + LOG(INFO) << "FullyConnectedLayer forwardBackward()"; + ComData fcData; + calcOutput(fcData, fcConfig, fcConfigArgs, useGpu); + + LOG(INFO) << "SelectiveFullyConnectedLayer forwardBackward()"; + ComData selFcData; + calcOutput(selFcData, selFcConfig, selConfigArgs, useGpu); + compareOutput(fcData, selFcData); + } +} +#endif // PADDLE_TYPE_DOUBLE + +void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config, + bool useGpu) { + FLAGS_use_gpu = useGpu; + size_t batchSize = 100; + size_t dataLayerSize = 512; + std::vector values(batchSize * dataLayerSize); + for (size_t j = 0; j < batchSize * dataLayerSize; ++j) { + values[j] = std::rand() / real(RAND_MAX); + } + LayerPtr dataLayer = + creatDataLayer("data", batchSize, dataLayerSize, values, useGpu); + + const string& 
selfcParaFile = + "legacy/gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose"; + const string& selfcParaName = "rand_fc_param.w.transpose"; + + std::shared_ptr selfcLayer = + std::dynamic_pointer_cast( + initFcLayer(dataLayer, + config, + dataLayerSize, + fcLayerWidth, + selfcParaName, + selfcParaFile, + useGpu)); + + // create selected columns + std::shared_ptr>> selCols( + new std::vector>(batchSize)); + size_t maxNNZ = 30; + srand((size_t)(time(NULL))); + int total = 0; + while (total == 0) { + for (size_t i = 0; i < batchSize; ++i) { + size_t num = std::rand() % maxNNZ; + int* data = new int[num]; + randint(data, fcLayerWidth, num); + (*selCols)[i] = std::make_pair(data, num); + total += num; + } + } + selfcLayer->fillSelectiveData(selCols); + selfcLayer->forward(PASS_TEST); + + MatrixPtr outMatSelfc = selfcLayer->getOutputValue(); + CpuSparseMatrixPtr cpuOutMatSelfc( + new CpuSparseMatrix(outMatSelfc->getHeight(), + outMatSelfc->getWidth(), + outMatSelfc->getElementCnt())); + cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT); +#ifdef PADDLE_WITH_CUDA + if (useGpu) { + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + } +#endif + real* outValueSelfc = cpuOutMatSelfc->getValue(); + size_t nnz = cpuOutMatSelfc->getElementCnt(); + + const string& fcParaFile = + "legacy/gserver/tests/SelectiveFcTest/model/rand_fc_param.w"; + const string& fcParaName = "rand_fc_param.w"; + LayerConfig fcLayerConfig; + fcLayerConfig.set_name("fc_layer"); + fcLayerConfig.set_type("fc"); + fcLayerConfig.set_active_type("linear"); + fcLayerConfig.set_size(fcLayerWidth); + + LayerPtr fcLayer = initFcLayer(dataLayer, + fcLayerConfig, + dataLayerSize, + fcLayerWidth, + fcParaName, + fcParaFile, + useGpu); + fcLayer->forward(PASS_TEST); + + MatrixPtr outMatFc = fcLayer->getOutputValue(); + MatrixPtr cpuOutMatFc( + new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth())); + cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT); +#ifdef PADDLE_WITH_CUDA + if (useGpu) { + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + } +#endif + real* outValueFc = cpuOutMatFc->getData(); + + compareSparseMulOutput(outValueFc, outValueSelfc, nnz, selCols); + for (size_t i = 0; i < batchSize; ++i) { + delete[](*selCols)[i].first; + } +} + +#ifndef PADDLE_TYPE_DOUBLE +// The parameter file used in testSelectiveFcLayerTrainSparseMul is float +TEST(Layer, SelectiveFcLayer_train_sparse_mul) { + LayerConfig selLayerConfig; + selLayerConfig.set_name("sel_fc"); + selLayerConfig.set_type("selective_fc"); + selLayerConfig.set_active_type("linear"); + selLayerConfig.set_has_selected_colums(false); + selLayerConfig.set_selective_fc_pass_generation(true); + selLayerConfig.set_size(fcLayerWidth); + + testSelectiveFcLayerTrainSparseMul(selLayerConfig, false); +#ifdef PADDLE_WITH_CUDA + testSelectiveFcLayerTrainSparseMul(selLayerConfig, true); +#endif +} +#endif // PADDLE_TYPE_DOUBLE + +// TODO(dangqingqing) test multi threads after support in matrix +// TEST(Layer, SelectiveFcLayer_train_sparse_mul_parallel) { +// LayerConfig selLayerConfig; +// selLayerConfig.set_name("sel_fc"); +// selLayerConfig.set_type("selective_fc"); +// selLayerConfig.set_active_type("linear"); +// selLayerConfig.set_has_selected_colums(false); +// selLayerConfig.set_selective_fc_pass_generation(true); +// selLayerConfig.set_selective_fc_parallel_plain_mul_thread_num(10); +// selLayerConfig.set_selective_fc_full_mul_ratio(1000); +// selLayerConfig.set_size(fcLayerWidth); +// SelectiveFcLayer_test(selLayerConfig, false); +// } + +int main(int argc, char** 
argv) {
+  paddle::initMain(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  initPython(argc, argv);
+  int ret = RUN_ALL_TESTS();
+  return ret;
+}
diff --git a/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..05acd714219fa5964b5b3595543682825ea67d84
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+const int MAX_SEQ_NUM = 17;
+const int MAX_SEQ_LEN = 23;
+const int MAX_BEAM_SIZE = 13;
+
+const size_t SEED = (size_t)(time(NULL));
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
+  seqStartPos.resize(1, 0);
+  subSeqStartPos.resize(1, 0);
+
+  srand(SEED);
+  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int j = 0; j < subSeqNum; ++j)
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % MAX_SEQ_LEN)));
+    seqStartPos.push_back(subSeqStartPos.back());
+  }
+}
+
+/*
+  generate start indices according to sequence start positions.
+ */
+void genStarts(vector<int>& seqStartPos,
+               vector<vector<real>>& starts,
+               size_t beamSize) {
+  starts.clear();
+  starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    vector<real> randStarts =
+        randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
+    copy(begin(randStarts), end(randStarts), begin(starts[i]));
+  }
+}
+
+/*
+  generate end indices according to sequence start positions and start indices.
+ */
+void genEnds(vector<int>& seqStartPos,
+             vector<vector<real>>& starts,
+             vector<vector<real>>& ends,
+             size_t beamSize) {
+  CHECK_EQ(seqStartPos.size() - 1, starts.size());
+  ends.clear();
+  ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < starts.size(); ++i) {
+    for (size_t j = 0; j < starts[i].size(); ++j) {
+      int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+      CHECK_GE(seqLen - 1, starts[i][j]);
+      if (starts[i][j] == -1.) break;
+      if (starts[i][j] == (seqLen - 1)) {
+        ends[i][j] = starts[i][j];
+      } else {
+        ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
+      }
+    }
+  }
+}
+
+void genTestData(vector<int>& seqStartPos,
+                 vector<int>& subSeqStartPos,
+                 vector<vector<real>>& starts,
+                 vector<vector<real>>& ends,
+                 bool hasSubseq) {
+  size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
+  genSeqInfo(seqStartPos, subSeqStartPos);
+
+  genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
+  genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
+}
+
+template <typename T>
+void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
+  size_t totalSize{0};
+  for (auto const& items : inVec) totalSize += items.size();
+  outVec.reserve(totalSize);
+
+  for (auto& items : inVec)
+    move(items.begin(), items.end(), back_inserter(outVec));
+}
+
+void testSeqSliceLayer(bool hasSubseq,
+                       bool useGpu,
+                       vector<int>& seqStartPos,
+                       vector<int>& subSeqStartPos,
+                       vector<vector<real>>& starts,
+                       vector<vector<real>>& ends) {
+  // The layer size is not crucial for this layer,
+  // so a small layer size is used in this unittest.
+  const size_t layerSize{4};
+  TestConfig config;
+  config.layerConfig.set_type("seq_slice");
+  config.layerConfig.set_size(layerSize);
+
+  // add the first input
+  MatrixPtr seqInputPtr =
+      Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
+                     layerSize,
+                     false,
+                     false);
+  seqInputPtr->randomizeUniform();
+
+  if (hasSubseq) {
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                "seq_input",
+                                seqInputPtr,
+                                seqStartPos,
+                                subSeqStartPos});
+  } else {
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
+  }
+  config.layerConfig.add_inputs();
+
+  // add start indices
+  if (starts.size()) {
+    vector<real> startsToVec;
+    flatten2dVector(starts, startsToVec);
+
+    MatrixPtr startMatrixPtr =
+        Matrix::create(starts.size(), starts[0].size(), false, false);
+    startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
+
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(true);
+  }
+
+  // add end indices
+  if (ends.size()) {
+    vector<real> endsToVec;
+    flatten2dVector(ends, endsToVec);
+
+    MatrixPtr endMatrixPtr =
+        Matrix::create(ends.size(), ends[0].size(), false, false);
+    endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
+
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(false);
+  }
+
+  testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
+}
+
+TEST(Layer, SeqSliceLayer) {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<vector<real>> starts;
+  vector<vector<real>> ends;
+
+  std::vector<bool> mode = {false};
+#ifdef PADDLE_WITH_CUDA
+  mode.push_back(true);
+#endif
+  genSeqInfo(seqStartPos, subSeqStartPos);
+  for (bool hasSubseq : {true, false}) {
+    LOG(INFO) << "hasSubSeq : " << hasSubseq;
+    genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
+    for (bool useGpu : mode) {
+      vector<vector<real>> tmp;
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return
RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_Upsample.cpp b/paddle/legacy/gserver/tests/test_Upsample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..940d46baf73f2d600cff6edc37c29a3a36bf5d90
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_Upsample.cpp
@@ -0,0 +1,153 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include
+#include
+
+#include "LayerGradUtil.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/testing/TestUtil.h"
+
+void setPoolConfig(paddle::TestConfig* config,
+                   paddle::PoolConfig* pool,
+                   const string& poolType) {
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool");
+  (*config).layerConfig.set_num_filters(1);
+
+  int kw = 2, kh = 2;
+  int pw = 0, ph = 0;
+  int sw = 2, sh = 2;
+  pool->set_pool_type(poolType);
+  pool->set_channels(2);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_start(0);
+  pool->set_padding(pw);
+  pool->set_padding_y(ph);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+
+  int ow =
+      paddle::outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh =
+      paddle::outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+}
+
+paddle::LayerPtr doOneUpsampleTest(const paddle::MatrixPtr& inputMat,
+                                   const string& poolType,
+                                   bool use_gpu,
+                                   real* tempGradData) {
+  /* prepare maxPoolWithMaskLayer */
+  paddle::TestConfig config;
+  config.inputDefs.push_back({paddle::INPUT_DATA, "layer_0", 128, 0});
+  paddle::LayerInputConfig* input = config.layerConfig.add_inputs();
+  paddle::PoolConfig* pool = input->mutable_pool_conf();
+
+  pool->set_img_size(8);
+  pool->set_img_size_y(8);
+  setPoolConfig(&config, pool, "max-pool-with-mask");
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  config.layerConfig.set_name("MaxPoolWithMask");
+
+  std::vector<paddle::DataLayerPtr> dataLayers;
+  paddle::LayerMap layerMap;
+  vector<paddle::Argument> datas;
+
+  initDataLayer(config,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "MaxPoolWithMask",
+                1,
+                false,
+                use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
+
+  FLAGS_use_gpu = use_gpu;
+  std::vector<paddle::ParameterPtr> parameters;
+  paddle::LayerPtr maxPoolingWithMaskOutputLayer;
+  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
+  maxPoolingWithMaskOutputLayer->forward(paddle::PASS_GC);
+
+  /* prepare the upsample layer */
+  paddle::LayerConfig upsampleLayerConfig;
+  upsampleLayerConfig.set_type("upsample");
+  paddle::LayerInputConfig* input1 = upsampleLayerConfig.add_inputs();
+  upsampleLayerConfig.add_inputs();
+
+  paddle::UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf();
+  upsampleConfig->set_scale(2);
+  paddle::ImageConfig* imageConfig = upsampleConfig->mutable_image_conf();
+  imageConfig->set_channels(2);
+  imageConfig->set_img_size(4);
+  imageConfig->set_img_size_y(4);
+  upsampleLayerConfig.set_size(2 * 8 * 8);
+  upsampleLayerConfig.set_name("upsample");
+
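+  // Wire both input slots of the upsample layer to the pooling layer set up
+  // above: the upsample step needs the pooled values together with the
+  // max-location mask that max-pool-with-mask recorded, so that each pooled
+  // value is scattered back to the position it came from (here scale 2 maps
+  // the 4x4 pooling output back onto the 8x8 input extent).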
for (size_t i = 0; i < 2; i++) { + paddle::LayerInputConfig& inputTemp = + *(upsampleLayerConfig.mutable_inputs(i)); + inputTemp.set_input_layer_name("MaxPoolWithMask"); + } + + paddle::LayerPtr upsampleLayer; + paddle::ParameterMap parameterMap; + upsampleLayer = paddle::Layer::create(upsampleLayerConfig); + layerMap[upsampleLayerConfig.name()] = upsampleLayer; + upsampleLayer->init(layerMap, parameterMap); + upsampleLayer->setNeedGradient(true); + upsampleLayer->forward(paddle::PASS_GC); + upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128); + upsampleLayer->backward(); + + return upsampleLayer; +} + +TEST(Layer, maxPoolingWithMaskOutputLayerFwd) { + bool useGpu = false; + paddle::MatrixPtr inputMat; + paddle::MatrixPtr inputGPUMat; + paddle::MatrixPtr tempGradMat; + + inputMat = paddle::Matrix::create(1, 128, false, useGpu); + inputMat->randomizeUniform(); + + tempGradMat = paddle::Matrix::create(1, 128, false, useGpu); + tempGradMat->randomizeUniform(); + real* tempGradData = tempGradMat->getData(); + + paddle::LayerPtr upsampleLayerCPU = + doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData); + +#ifdef PADDLE_WITH_CUDA + useGpu = true; + real* data = inputMat->getData(); + inputGPUMat = paddle::Matrix::create(1, 128, false, useGpu); + inputGPUMat->copyFrom(data, 128); + paddle::LayerPtr upsampleLayerGPU = doOneUpsampleTest( + inputGPUMat, "max-pool-with-mask", useGpu, tempGradData); + paddle::checkMatrixEqual(upsampleLayerCPU->getOutput("").value, + upsampleLayerGPU->getOutput("").value); + + paddle::checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(), + upsampleLayerGPU->getPrev(0)->getOutputGrad()); +#endif +} diff --git a/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp b/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..34b88e68930ad54cddc6faaaa1bee18875031624 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp @@ -0,0 +1,244 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <gtest/gtest.h>
+#include
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/CTCLayer.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/gserver/layers/Layer.h"
+#include "paddle/legacy/gserver/layers/WarpCTCLayer.h"
+
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+
+const real* getData(const Matrix& matrix) {
+  if (matrix.useGpu()) {
+    MatrixPtr cpuMatrix = Matrix::create(
+        matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false);
+    cpuMatrix->copyFrom(matrix);
+    return cpuMatrix->getData();
+  } else {
+    return matrix.getData();
+  }
+}
+
+int checkError(const Matrix& matrix1, const Matrix& matrix2) {
+  CHECK_EQ(matrix1.getHeight(), matrix2.getHeight());
+  CHECK_EQ(matrix1.getWidth(), matrix2.getWidth());
+  CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+
+  const real* data1 = getData(matrix1);
+  const real* data2 = getData(matrix2);
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
+        count++;
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+  return count;
+}
+
+void initArgument(size_t batchSize,
+                  int layerSize,
+                  bool useGpu,
+                  Argument& data) {
+  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
+  data.grad = Matrix::create(batchSize, layerSize, false, useGpu);
+  data.value->randomizeUniform();
+  data.value->add(-0.5);
+  data.grad->zeroMem();
+
+  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
+}
+
+LayerPtr createDataLayer(
+    string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) {
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("data");
+  layerConfig.set_size(layerSize);
+  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
+
+  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
+  dataLayer->setData(data);
+  dataLayer->forward(PASS_GC);
+
+  return layer;
+}
+
+LayerPtr createLabelLayer(string name,
+                          size_t batchSize,
+                          size_t numClasses,
+                          bool useGpu) {
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("data");
+  layerConfig.set_size(1);
+  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
+
+  Argument data;
+  data.ids = IVector::create(batchSize, useGpu);
+  data.ids->rand(numClasses - 1);
+
+  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
+
+  DataLayerPtr labelLayer = std::dynamic_pointer_cast<DataLayer>(layer);
+  labelLayer->setData(data);
+  labelLayer->forward(PASS_GC);
+
+  return layer;
+}
+
+LayerPtr createCTCLayer(string name,
+                        size_t numClasses,
+                        bool useGpu,
+                        bool normByTimes,
+                        LayerPtr dataLayer,
+                        LayerPtr labelLayer) {
+  LayerMap layerMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  layerMap[labelLayer->getName()] = labelLayer;
+
+  ParameterMap parameterMap;
+
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("ctc");
+  layerConfig.set_size(numClasses);
+  layerConfig.set_norm_by_times(normByTimes);
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
+  input0.set_input_layer_name(dataLayer->getName());
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
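+  // slot 1 carries the label sequence. Note the asymmetry with
+  // createWarpCTCLayer() below: this built-in "ctc" layer is handed
+  // explicitly softmax-normalized activations (see the ActivationFunction
+  // forward/backward calls that follow), while the "warp_ctc" layer is not
+  // wrapped that way, as it normalizes its input internally.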
input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new CTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + ActivationFunction* softmaxActivation = ActivationFunction::create("softmax"); + + softmaxActivation->forward(dataLayer->getOutput()).check(); + layer->forward(PASS_GC); + + layer->backward(); + softmaxActivation->backward(dataLayer->getOutput()).check(); + + return layer; +} + +LayerPtr createWarpCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("warp_ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_blank(numClasses - 1); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + layer->forward(PASS_GC); + layer->backward(); + + return layer; +} + +TEST(Layer, WarpCTCLayer) { + for (auto layerSize : {10, 64}) { + for (auto batchSize : {1, 10, 32}) { + for (auto normByTimes : {false, true}) { + for (auto useGpu : {false, true}) { +#ifndef PADDLE_WITH_CUDA + if (useGpu) continue; +#endif + LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize + << " normByTimes = " << normByTimes << " useGpu=" << useGpu; + + FLAGS_use_gpu = useGpu; + + Argument data0; + initArgument(batchSize, layerSize, useGpu, data0); + + Argument data1; + data1.resizeAndCopyFrom(data0); + + LayerPtr dataLayer0 = + createDataLayer("data", batchSize, layerSize, useGpu, data0); + LayerPtr dataLayer1 = + createDataLayer("data", batchSize, layerSize, useGpu, data1); + + LayerPtr labelLayer = + createLabelLayer("label", batchSize, layerSize, useGpu); + + LayerPtr warpctcLayer = createWarpCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer); + LayerPtr ctcLayer = createCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer); + + /// Check cost + LOG(INFO) << "Check cost: " + << checkError(*(warpctcLayer->getOutput().value), + *(ctcLayer->getOutput().value)) + << " different elements."; + + /// Check gradients + LOG(INFO) << "Check gradients: " + << checkError(*(dataLayer0->getOutput().grad), + *(dataLayer1->getOutput().grad)) + << " different elements"; + } + } + } + } +} diff --git a/paddle/math/Allocator.h b/paddle/legacy/math/Allocator.h similarity index 100% rename from paddle/math/Allocator.h rename to paddle/legacy/math/Allocator.h diff --git a/paddle/math/BaseMatrix.cu b/paddle/legacy/math/BaseMatrix.cu similarity index 100% rename from paddle/math/BaseMatrix.cu rename to paddle/legacy/math/BaseMatrix.cu diff --git a/paddle/math/BaseMatrix.h b/paddle/legacy/math/BaseMatrix.h similarity index 100% rename from paddle/math/BaseMatrix.h rename to paddle/legacy/math/BaseMatrix.h diff --git a/paddle/legacy/math/CMakeLists.txt b/paddle/legacy/math/CMakeLists.txt new file mode 100644 index 
0000000000000000000000000000000000000000..9992ec71f45b592e0a73e1cc9c655e773fa18e86 --- /dev/null +++ b/paddle/legacy/math/CMakeLists.txt @@ -0,0 +1,57 @@ +# common package contains: +# * the utilities: +# * Thread Libs +# * Memory Manage libs +# * CommandLine Parser +# * Logging +# * Timer/Stats +# * the math libraries: +# * Matrix/Vector +# * the parameter optimizers. +# * the parameter updater functions. +# +# TODO(yuyang18): separate libs. +# +file(GLOB MATH_HEADERS . *.h) +file(GLOB MATH_SOURCES . *.cpp) + +if(NOT WITH_MKLDNN) + set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h") + set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp") + list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}") + list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}") + message(STATUS "Skip compiling with MKLDNNMatrix") +else() + message(STATUS "Compile with MKLDNNMatrix") +endif() + +if(MOBILE_INFERENCE) + # Remove sparse + list(REMOVE_ITEM MATH_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h + ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h + ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h) + list(REMOVE_ITEM MATH_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp) +endif() +set(MATH_SOURCES + "${PADDLE_SOURCE_DIR}/paddle/legacy/math/BaseMatrix.cu" + "${PADDLE_SOURCE_DIR}/paddle/legacy/math/TrainingAlgorithmOp.cu" + ${MATH_SOURCES}) +if(NOT WITH_GPU) + # then compile BaseMatrix.cu as c++ file + compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/legacy/math/BaseMatrix.cu") + compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/legacy/math/TrainingAlgorithmOp.cu") + add_library(paddle_math STATIC + ${MATH_SOURCES}) +else() + cuda_add_library(paddle_math ${MATH_SOURCES}) +endif() + + +add_dependencies(paddle_math paddle_proto ${external_project_dependencies}) # depends +if(WITH_TESTING) + add_subdirectory(tests) +endif() diff --git a/paddle/legacy/math/CpuSparseMatrix.cpp b/paddle/legacy/math/CpuSparseMatrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..88683ec98464561f70b9619e834f4029cfedc91a --- /dev/null +++ b/paddle/legacy/math/CpuSparseMatrix.cpp @@ -0,0 +1,787 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "CpuSparseMatrix.h" +#include "SparseMatrix.h" +#include "float.h" +#include "hl_gpu.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/utils/Util.h" + +namespace paddle { + +const size_t CpuSparseMatrix::DEFAULT_AVG_WIDTH; + +CpuSparseMatrix::CpuSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans) + : Matrix(NULL, height, width, trans, false) { + resize(height, width, nnz, valueType, format); +} + +CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans) + : Matrix(dataHandle, height, width, trans, false) { + resize(height, width, nnz, valueType, format); +} + +CpuSparseMatrix::CpuSparseMatrix(real* data, + int* rows, + int* cols, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans) + : Matrix(NULL, height, width, trans, false) { + cols_ = cols; + rows_ = rows; + value_ = data; + height_ = height; + width_ = width; + elementCnt_ = nnz; + valueType_ = valueType; + format_ = format; +} + +void CpuSparseMatrix::resize(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType, + SparseFormat format) { + CHECK_LE(newNnz, newHeight * newWidth); + size_t newSize = 0; + if (format == SPARSE_CSR) { + newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int); + } else { + newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int); + } + + if (NO_VALUE != valueType) { + newSize += newNnz * sizeof(real); + } + + if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) { + memoryHandle_ = std::make_shared(newSize); + } + + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newNnz; + valueType_ = valueType; + format_ = format; + sparseResize(); +} +void CpuSparseMatrix::sparseResize() { + if (format_ == SPARSE_CSR) { + rows_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf())); + cols_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf()) + + (height_ + 1) * sizeof(int)); + if (NO_VALUE != valueType_) { + value_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf()) + + (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); + } else { + value_ = NULL; + } + } else { + cols_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf())); + rows_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf()) + + (width_ + 1) * sizeof(int)); + if (NO_VALUE != valueType_) { + value_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf()) + + (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); + } else { + value_ = NULL; + } + } +} + +void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth) { + resize(newHeight, + newWidth, + newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth), + valueType_, + format_); +} + +MatrixPtr CpuSparseMatrix::getTranspose() { + if (!memoryHandle_ && !value_) { + MatrixPtr dest(new CpuSparseMatrix( + height_, width_, elementCnt_, valueType_, format_, true)); + return dest; + } else if (memoryHandle_) { + MatrixPtr dest(new CpuSparseMatrix( + std::dynamic_pointer_cast(memoryHandle_), + height_, + width_, + elementCnt_, + valueType_, + format_, + true)); + return dest; + } else if (value_) { + MatrixPtr dest(new CpuSparseMatrix(value_, + rows_, + cols_, + height_, + width_, + elementCnt_, + valueType_, + format_, + true)); + return dest; + } else { + return NULL; + } +} + +SparseValueType 
CpuSparseMatrix::getValueType() { return valueType_; } + +void CpuSparseMatrix::mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { + CHECK(!isTransposed()) << "Not supported"; + const auto a_ptr = dynamic_cast(&a); + const auto b_ptr = dynamic_cast(&b); + + if (a_ptr && b_ptr) { + CpuMatrix::mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, this, scaleAB, scaleT); + } else { + LOG(FATAL) << "not supported"; + } +} + +void CpuSparseMatrix::add3(CpuMatrix* b) { + CHECK(getFormat() != SPARSE_CSC) << "Not supported"; + CHECK(height_ == b->getHeight()); + CHECK(width_ == b->getWidth()); + real* A = getValue(); + real* B = b->getData(); + int* cols = getCols(); + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + A[j] = B[i * width_ + cols[j]]; + } + } +} + +void CpuSparseMatrix::add3(MatrixPtr b) { + if (dynamic_cast(b.get())) { + add3(dynamic_cast(b.get())); + } else { + LOG(FATAL) << "not supported"; + } +} + +void CpuSparseMatrix::addBias(Matrix& b, real scale) { + CHECK_EQ(b.getHeight(), (size_t)1); + CHECK_EQ(width_, b.getWidth()); + real* A = getValue(); + real* B = b.getData(); + int* cols = getCols(); + size_t nnz = getElementCnt(); + for (size_t i = 0; i < nnz; i++) { + A[i] += scale * B[cols[i]]; + } +} + +template +void printBuf(std::ostream& os, T* a, size_t len, const char* name) { + os << "\n: " << name << " ["; + for (size_t i = 0; i < len; i++) { + os << a[i] << " "; + } + os << "]\n"; +} + +void CpuSparseMatrix::print(std::ostream& os) const { + size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1; + size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_; + printBuf(os, rows_, rowSize, "row"); + printBuf(os, cols_, colSize, "col"); + if (valueType_ == FLOAT_VALUE) { + printBuf(os, value_, elementCnt_, "value"); + } + return; +} + +void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const { + CHECK_LT(idx, height_); + if (format_ == SPARSE_CSC) { + LOG(FATAL) << "SPARSE_CSC not supported"; + return; + } + + const int* col = getRowCols(idx); + size_t num = getColNum(idx); + if (num > 0) { + if (valueType_ == FLOAT_VALUE) { + const real* data = getRowValues(idx); + os << col[0] << ":" << data[0]; + for (size_t i = 1; i < num; ++i) { + os << " " << col[i] << ":" << data[i]; + } + } else { + os << col[0]; + for (size_t i = 1; i < num; ++i) { + os << " " << col[i]; + } + } + } + os << ";"; +} + +void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { + CHECK(getFormat() != SPARSE_CSC) << "Not supported"; + CHECK_EQ(height_, b.getHeight()); + CHECK_EQ(width_, b.getWidth()); + real* A = getValue(); + real* B = b.getValue(); + if (b.getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = B[j] * c.getElement(i, cCol); + } + } + } else if (b.getValueType() == NO_VALUE) { + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = c.getElement(i, cCol); + } + } + } +} + +void CpuSparseMatrix::randomizeUniform() { + CHECK_LE(elementCnt_, height_ * width_); + if (valueType_ == FLOAT_VALUE) { + real* data = 
getValue(); + for (size_t i = 0; i < elementCnt_; ++i) { + *data++ = rand() / static_cast(RAND_MAX); // NOLINT + } + } + if (format_ == SPARSE_CSR) { + sparseRand(rows_, cols_, elementCnt_, height_ + 1, width_, false); + } else { + sparseRand(cols_, rows_, elementCnt_, width_ + 1, height_, false); + } +} + +void CpuSparseMatrix::copyFrom(std::vector& rows, + std::vector& cols, + std::vector& values) { + size_t size = format_ == SPARSE_CSR ? cols.size() : rows.size(); + resize(height_, width_, size, valueType_, format_); + if (valueType_ == FLOAT_VALUE) { + memcpy(&value_[0], &values[0], sizeof(real) * values.size()); + } + memcpy(&cols_[0], &cols[0], sizeof(int) * cols.size()); + memcpy(&rows_[0], &rows[0], sizeof(int) * rows.size()); +} + +// Copy from a CpuMatrix, only supported in sparse_float_value_t +// SparseMatrix. +void CpuSparseMatrix::copyFrom(const CpuMatrix& src) { + CHECK_EQ(getHeight(), src.getHeight()); + CHECK_EQ(getWidth(), src.getWidth()); + CHECK(!src.trans_ && !trans_); + if (format_ == SPARSE_CSR) { + std::vector rows(getHeight() + 1); + std::vector cols; + std::vector values; + rows[0] = 0; + for (size_t r = 0; r < getHeight(); ++r) { + for (size_t c = 0; c < getWidth(); ++c) { + real v = src.getElement(r, c); + if (fabs(v) > FLT_EPSILON) { + cols.push_back(c); + values.push_back(v); + } + } + rows[r + 1] = values.size(); + } + copyFrom(rows, cols, values); + } else { + std::vector cols(getWidth() + 1); + std::vector rows; + std::vector values; + cols[0] = 0; + for (size_t r = 0; r < getWidth(); ++r) { + for (size_t c = 0; c < getHeight(); ++c) { + real v = src.getElement(c, r); + if (fabs(v) > FLT_EPSILON) { + rows.push_back(c); + values.push_back(v); + } + } + cols[r + 1] = values.size(); + } + copyFrom(rows, cols, values); + } +} + +MatrixPtr CpuSparseMatrix::clone(size_t height, size_t width, bool useGpu) { + if (height == 0 && width == 0) { + height = height_; + width = width_; + } + CHECK(width && height); + if (!useGpu) { + return std::make_shared( + height, width, 0, valueType_, format_); + } else { + return std::make_shared( + height, width, elementCnt_, valueType_, format_); + } +} + +MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) { + CHECK_LE(startRow + numRows, height_); + CHECK_EQ(format_, SPARSE_CSR); + if (valueType_ == NO_VALUE) { + return std::make_shared( + nullptr, + rows_ + startRow, + cols_, + numRows, + width_, + rows_[startRow + numRows] - rows_[startRow], + valueType_, + format_, + trans_); + } else { + return std::make_shared( + value_, + rows_ + startRow, + cols_, + numRows, + width_, + rows_[startRow + numRows] - rows_[startRow], + valueType_, + format_, + trans_); + } +} + +/* mem MUST be alloced outside (memAlloc=false) */ +void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { + CHECK(!memAlloc); + CpuSparseMatrix* mat = dynamic_cast(matTrans.get()); + if (format_ == SPARSE_CSR) { + /*statistic element number in each col*/ + int* colCounters = mat->getRows() + 1; + memset(colCounters, 0, sizeof(int) * width_); + for (size_t i = 0; i < elementCnt_; ++i) { + int col = cols_[i]; + colCounters[col]++; + } + /*fill mat rows */ + mat->getRows()[0] = 0; + for (size_t i = 1; i < width_ + 1; i++) { + mat->getRows()[i] = mat->getRows()[i - 1] + mat->getRows()[i]; + } + /*fill mat values and cols*/ + std::vector colNumVec(width_, 0); + if (valueType_ == FLOAT_VALUE) { + for (size_t i = 0; i < height_; i++) { + for (int j = rows_[i]; j < rows_[i + 1]; j++) { + int colIdx = cols_[j]; + int index = 
mat->getRows()[colIdx] + colNumVec[colIdx]; + mat->getCols()[index] = i; + mat->getValue()[index] = value_[j]; + colNumVec[colIdx]++; + } + } + } else { + for (size_t i = 0; i < height_; i++) { + for (int j = rows_[i]; j < rows_[i + 1]; j++) { + int colIdx = cols_[j]; + int index = mat->getRows()[colIdx] + colNumVec[colIdx]; + mat->getCols()[index] = i; + colNumVec[colIdx]++; + } + } + } + } else { + /*statistic element number in each row*/ + int* rowCounters = mat->getCols() + 1; + memset(rowCounters, 0, sizeof(int) * height_); + for (size_t i = 0; i < elementCnt_; ++i) { + int row = rows_[i]; + rowCounters[row]++; + } + + /*fill mat cols */ + mat->getCols()[0] = 0; + for (size_t i = 1; i < height_ + 1; i++) { + mat->getCols()[i] = mat->getCols()[i - 1] + mat->getCols()[i]; + } + /*fill mat values and rows*/ + std::vector rowNumVec(height_, 0); + if (valueType_ == FLOAT_VALUE) { + for (size_t i = 0; i < width_; i++) { + for (int j = cols_[i]; j < cols_[i + 1]; j++) { + int rowIdx = rows_[j]; + int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx]; + mat->getRows()[index] = i; + mat->getValue()[index] = value_[j]; + rowNumVec[rowIdx]++; + } + } + } else { + for (size_t i = 0; i < width_; i++) { + for (int j = cols_[i]; j < cols_[i + 1]; j++) { + int rowIdx = rows_[j]; + int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx]; + mat->getRows()[index] = i; + rowNumVec[rowIdx]++; + } + } + } + } +} + +void CpuSparseMatrix::setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { + if (format_ == SPARSE_CSR) { + CHECK_LT(row, height_); + CHECK(NULL != cols); + if (0 == row) { + rows_[row] = 0; + } + rows_[row + 1] = rows_[row] + colNum; + for (size_t i = 0; i < colNum; ++i) { + cols_[rows_[row] + i] = cols[i]; + } + if (valueType_ == NO_VALUE) { + CHECK(!values); + } else { + for (size_t i = 0; i < colNum; ++i) { + value_[rows_[row] + i] = values[i]; + } + } + } else { + LOG(FATAL) << "not supported"; + } +} + +void CpuSparseMatrix::fillRowIndices(IVectorPtr& outVec) const { + if (format_ == SPARSE_CSR) { + auto nnz = getElementCnt(); + IVector::resizeOrCreate(outVec, nnz, false); + auto out = outVec->getData(); + int* rows = getRows(); + for (size_t i = 0; i < height_; i++) { + for (int j = rows[i]; j < rows[i + 1]; j++) { + out[j] = i; + } + } + } else { + LOG(FATAL) << "SPARSE_CSC not supported"; + } +} + +ThreadLocal> CpuSparseMatrix::cpuLocalMats_; + +CpuSparseMatrixPtr CpuSparseMatrix::getTmpSparseMatrix(size_t height, + size_t width) { + std::vector* localMats = cpuLocalMats_.get(); + auto it = localMats->begin(); + while (it != localMats->end()) { + if (it->unique()) { + (*it)->resize(height, width, elementCnt_, valueType_, format_); + return *it; + } + } + localMats->emplace_back(std::make_shared( + height, width, elementCnt_, valueType_, format_, false)); + return localMats->back(); +} + +void CpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { + if (dynamic_cast(&src)) { + auto tmpSrc = dynamic_cast(&src); + copyFrom(*tmpSrc, stream); + } else if (dynamic_cast(&src)) { + auto tmpSrc = dynamic_cast(&src); + copyFrom(*tmpSrc); + } else if (dynamic_cast(&src)) { + auto tmpSrc = dynamic_cast(&src); + copyFrom(*tmpSrc); + } else { + LOG(FATAL) << "not implemented"; + } +} + +void CpuSparseMatrix::copyFrom(const Matrix& src) { + if (dynamic_cast(&src)) { + auto tmpSrc = dynamic_cast(&src); + copyFrom(*tmpSrc); + } else if (dynamic_cast(&src)) { + auto tmpSrc = dynamic_cast(&src); + copyFrom(*tmpSrc); + } else { + LOG(FATAL) << "not 
implemented"; + } +} + +void CpuSparseMatrix::copyFrom(const GpuSparseMatrix& src, hl_stream_t stream) { + CHECK_EQ(height_, src.getHeight()); + CHECK_EQ(width_, src.getWidth()); + CHECK_EQ(size_t(elementCnt_), src.getElementCnt()); + size_t valSize = valueType_ == NO_VALUE ? 0 : elementCnt_; + if (format_ == SPARSE_CSC) + hl_memcpy_from_csc_matrix(value_, + valSize, + rows_, + elementCnt_, + cols_, + width_ + 1, + src.sMatrix_.get(), + stream); + else + hl_memcpy_from_csr_matrix(value_, + valSize, + rows_, + height_ + 1, + cols_, + elementCnt_, + src.sMatrix_.get(), + stream); +} + +void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) { + CHECK_EQ(height_, src.getHeight()); + CHECK_EQ(width_, src.getWidth()); + CHECK_EQ(format_, src.getFormat()); + int start = format_ == SPARSE_CSR ? src.getRows()[0] : src.getCols()[0]; + if (format_ == SPARSE_CSR) { + size_t totalColNum = 0; + for (size_t i = 0; i < height_; ++i) { + totalColNum += src.getColNum(i); + } + resize(height_, width_, totalColNum, valueType_, format_); + rows_[0] = 0; + for (size_t i = 0; i < height_; ++i) { + rows_[i + 1] = rows_[i] + src.getColNum(i); + } + memcpy(cols_, src.getCols() + start, totalColNum * sizeof(int)); + } else { + size_t totalColNum = 0; + for (size_t i = 0; i < width_; ++i) { + totalColNum += src.getRowNum(i); + } + resize(height_, width_, totalColNum, valueType_, format_); + cols_[0] = 0; + for (size_t i = 0; i < width_; ++i) { + cols_[i + 1] = cols_[i] + src.getRowNum(i); + } + memcpy(rows_, src.getRows() + start, totalColNum * sizeof(int)); + } + + // if have different value type, only copy rows and cols + if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) { + memcpy(value_, src.getValue() + start, elementCnt_ * sizeof(real)); + } +} + +void CpuSparseMatrix::copyRow(int offsets, + size_t colNum, + const sparse_non_value_t* row) { + for (size_t j = 0; j < colNum; j++) { + cols_[offsets + j] = row[j].col; + } +} + +void CpuSparseMatrix::copyRow(int offsets, + size_t colNum, + const sparse_float_value_t* row) { + for (size_t j = 0; j < colNum; j++) { + cols_[offsets + j] = row[j].col; + value_[offsets + j] = row[j].value; + } +} + +template +void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data) { + size_t totalColNum = 0; + for (size_t i = 0; i < height_; ++i) { + int64_t id = ids[i]; + totalColNum += indices[id + 1] - indices[id]; + } + valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE; + + resize(height_, width_, totalColNum, valueType_, format_); + + rows_[0] = 0; + for (size_t i = 0; i < height_; ++i) { + int64_t id = ids[i]; + T* row = data + indices[id]; + size_t colNum = indices[id + 1] - indices[id]; + rows_[i + 1] = rows_[i] + colNum; + copyRow(rows_[i], colNum, row); + } +} + +template +void CpuSparseMatrix::copyFrom(int64_t* indices, T* data) { + CHECK(format_ == SPARSE_CSR); + size_t totalColNum = indices[height_] - indices[0]; + valueType_ = typeid(T) == typeid(sparse_non_value_t) ? 
NO_VALUE : FLOAT_VALUE; + resize(height_, width_, totalColNum, valueType_, format_); + + rows_[0] = 0; + for (size_t i = 0; i < height_; ++i) { + T* row = data + indices[i]; + size_t colNum = indices[i + 1] - indices[i]; + rows_[i + 1] = rows_[i] + colNum; + copyRow(rows_[i], colNum, row); + } +} + +void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) { + CHECK_EQ(height_, src.getHeight()); + CHECK_LE(width_, src.getWidth()); + CHECK_EQ(format_, src.getFormat()); + CHECK_EQ(valueType_, src.getValueType()); + if (format_ == SPARSE_CSR) { + int* srcCols = src.getCols(); + size_t numLessWidth = + std::count_if(srcCols, srcCols + src.getElementCnt(), [this](size_t n) { + return n < this->width_; + }); + resize(height_, width_, numLessWidth, valueType_, format_); + rows_[0] = 0; + size_t index = 0; + for (size_t r = 0; r < height_; ++r) { + for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) { + if (srcCols[i] < static_cast(width_)) { + cols_[index] = srcCols[i]; + if (valueType_ == FLOAT_VALUE) { + value_[index] = src.getValue()[i]; + } + ++index; + } + } + rows_[r + 1] = index; + } + CHECK_EQ(index, numLessWidth); + } else { + size_t numLessWidth = src.getCols()[width_] - src.getCols()[0]; + resize(height_, width_, numLessWidth, valueType_, format_); + cols_[0] = 0; + size_t index = 0; + // note: c < width_, not src.getWidth(); + for (size_t c = 0; c < width_; ++c) { + for (int i = src.getCols()[c]; i < src.getCols()[c + 1]; ++i) { + rows_[index] = src.getRows()[i]; + if (valueType_ == FLOAT_VALUE) { + value_[index] = src.getValue()[i]; + } + ++index; + } + cols_[c + 1] = index; + } + CHECK_EQ(index, numLessWidth); + } +} + +void CpuSparseMatrix::zeroMem() { + CHECK(valueType_ == FLOAT_VALUE); + memset(value_, 0, elementCnt_ * sizeof(real)); +} + +template void CpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, + sparse_non_value_t* data); + +template void CpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, + sparse_float_value_t* data); + +template void CpuSparseMatrix::copyFrom(int64_t* indices, + sparse_non_value_t* data); + +template void CpuSparseMatrix::copyFrom(int64_t* indices, + sparse_float_value_t* data); + +void CpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { + size_t numSamples = getHeight(); + size_t beam = maxVal.getWidth(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getHeight(), numSamples); + maxVal.zeroMem(); + int* outids = maxIds.getData(); + real* outvalues = maxVal.getData(); + + typedef std::pair valuepair; + std::vector vec; + for (size_t i = 0; i < numSamples; i++) { + vec.clear(); + + auto num = getColNum(i); + auto ids = getRowCols(i); + auto values = getRowValues(i); + for (size_t j = 0; j < num; j++) { + vec.push_back(std::make_pair(values[j], ids[j])); + } + + size_t outsize = std::min(num, beam); + std::partial_sort(vec.begin(), + vec.begin() + outsize, + vec.end(), + [](const valuepair& a, const valuepair& b) { + return a.first > b.first; + }); + for (size_t j = 0; j < outsize; j++) { + outids[i * beam + j] = vec[j].second; + outvalues[i * beam + j] = vec[j].first; + } + if (outsize < beam) { + // if the number of values to sort are less than the output size, + // use -1 to indicate the end of valid sorted values. 
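+      // (only this first unused slot gets the marker: maxVal was zeroed
+      // above, but the remaining ids of the row are left untouched, so
+      // readers should stop at the first -1)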
+ outids[i * beam + outsize] = -1; + } + } +} + +} // namespace paddle diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/legacy/math/CpuSparseMatrix.h similarity index 100% rename from paddle/math/CpuSparseMatrix.h rename to paddle/legacy/math/CpuSparseMatrix.h diff --git a/paddle/math/ExecViaCpu.h b/paddle/legacy/math/ExecViaCpu.h similarity index 100% rename from paddle/math/ExecViaCpu.h rename to paddle/legacy/math/ExecViaCpu.h diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/legacy/math/MKLDNNMatrix.cpp similarity index 100% rename from paddle/math/MKLDNNMatrix.cpp rename to paddle/legacy/math/MKLDNNMatrix.cpp diff --git a/paddle/legacy/math/MKLDNNMatrix.h b/paddle/legacy/math/MKLDNNMatrix.h new file mode 100644 index 0000000000000000000000000000000000000000..5a0e5f85923dfd822dad4c63679acde63719f217 --- /dev/null +++ b/paddle/legacy/math/MKLDNNMatrix.h @@ -0,0 +1,256 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Matrix.h" +#include "mkldnn.hpp" +#include "paddle/legacy/parameter/Parameter.h" + +namespace paddle { + +class MKLDNNMatrix; +typedef std::shared_ptr MKLDNNMatrixPtr; + +#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...) \ + CHECK(MAT) << " can not be empty."; \ + CHECK(MAT->getPrimitiveDesc() == PD) \ + << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n " \ + << "" __VA_ARGS__; + +/** + * @brief MKLDNN Matrix. + * + */ +class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory { + public: + MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd) + : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false), + mkldnn::memory(pd, m->getData()), + m_(m) {} + + ~MKLDNNMatrix() {} + + /** + * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc + */ + static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd, + MatrixPtr m = nullptr); + + /** + * Create MKLDNNMatrix from a MatrixPtr and memory details info + */ + static MKLDNNMatrixPtr create( + mkldnn::memory::dims dims, + mkldnn::memory::format fmt, + mkldnn::engine& eg, + MatrixPtr m = nullptr, + mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); + + /** + * Create primitive descriptor. + * default with f32 dtype + */ + static mkldnn::memory::primitive_desc createPrimitiveDesc( + const mkldnn::memory::dims dims, + const mkldnn::memory::format& fmt, + const mkldnn::engine& eg, + const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { + return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg); + } + + /** + * Create Memory descriptor. + * default with any format and f32 dtype + */ + static mkldnn::memory::desc createMemoryDesc( + const mkldnn::memory::dims dims, + const mkldnn::memory::format& fmt = mkldnn::memory::format::any, + const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { + return mkldnn::memory::desc(dims, dtype, fmt); + } + + /** + * Create reorder primitive. 
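+   * (In mkldnn, a reorder is the primitive that copies a memory object from
+   * one layout/format to another; like any other primitive, the handle
+   * returned here is executed later by submitting it to a stream.)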
+   * Create a mkldnn::reorder handle that converts the src MKLDNNMatrix to
+   * the dst one.
+   * checkData: whether to check the data handles of src and dst.
+   *            If true, src and dst are required to have different data
+   *            buffers; if false, no such check is done, so the reorder
+   *            created may work on an in-place buffer.
+   *            Only pass false when you can guarantee that the in-place
+   *            logic works with your reorder.
+   */
+  static std::shared_ptr<mkldnn::reorder> createReorder(
+      const MKLDNNMatrixPtr& src,
+      const MKLDNNMatrixPtr& dst,
+      bool checkData = true);
+
+  void copyFrom(const Matrix& src) {
+    // TODO(TJ): reorder data if this format is not nchw or x
+    m_->copyFrom(src);
+  }
+
+  void copyTo(Matrix& dst) {
+    // TODO(TJ): reorder data if this format is not nchw or x
+    dst.copyFrom(*m_);
+  }
+
+ public:
+  /**
+   * Reorder this MKLDNNMatrix from another format.
+   * Supports in-place reorder.
+   * @note: this function only reorders the data layout;
+   *        it will NOT change the original dim or format info.
+   */
+  void reorderDataFrom(const MKLDNNMatrixPtr& m,
+                       memory::format srcFmt,
+                       memory::dims targetDim);
+
+  /**
+   * Reorder this MKLDNNMatrix to another format.
+   * Supports in-place reorder.
+   * @note: this function only reorders the data layout;
+   *        it will NOT change the dst dim or format info.
+   */
+  void reorderDataTo(const MKLDNNMatrixPtr& m,
+                     memory::format dstFmt,
+                     memory::dims targetDim);
+
+  /**
+   * Dimensionality reduction.
+   * Change format "nchw --> nc" or "oihw --> oi" if h and w are both 1.
+   */
+  void downSpatial();
+
+  /**
+   * Set the memory data handle.
+   * Caution: this does not check the buffer size of the data;
+   *          that must be covered by the user.
+   */
+  void setData(real* data) {
+    set_data_handle(data);
+    CpuMatrix::setData(data);
+    m_.reset();
+  }
+
+  /**
+   * Override CpuMatrix::resize.
+   */
+  void resize(size_t newHeight, size_t newWidth) override {
+    m_->resize(newHeight, newWidth);
+    if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
+      return;
+    }
+    CpuMatrix::setData(data_);
+    height_ = newHeight;
+    width_ = newWidth;
+    elementCnt_ = newHeight * newWidth;
+    stride_ = width_;
+    auto pd = mkldnn::memory::primitive_desc(
+        mkldnn::memory::desc({(int)newHeight, (int)newWidth},
+                             getDtype(),
+                             mkldnn::memory::format::nc),
+        getEngine());
+    resetMKLDNNMemory(pd, data_);
+  }
+
+  /**
+   * Override Matrix::getData;
+   * check the data handle before returning.
+   */
+  real* getData() override {
+    CHECK_EQ((void*)data_, get_data_handle());
+    return data_;
+  }
+
+  const real* getData() const override {
+    CHECK_EQ((void*)data_, get_data_handle());
+    return data_;
+  }
+
+  /**
+   * Get primitive descriptor.
+   */
+  mkldnn::memory::primitive_desc getPrimitiveDesc() {
+    return this->get_primitive_desc();
+  }
+
+  /**
+   * Get memory descriptor.
+   */
+  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
+
+  /**
+   * Get dimensions.
+   */
+  mkldnn::memory::dims getDims() {
+    mkldnn::memory::desc md = getMemoryDesc();
+    const int* src = md.data.dims;
+    int ndims = md.data.ndims;
+    mkldnn::memory::dims dst;
+    dst.resize(ndims);
+    for (int i = 0; i < ndims; ++i) {
+      dst[i] = src[i];
+    }
+    return dst;
+  }
+
+  /**
+   * Get format.
+   */
+  mkldnn::memory::format getFormat() {
+    return (mkldnn::memory::format)(getMemoryDesc().data.format);
+  }
+
+  /**
+   * Get memory data type.
+   */
+  mkldnn::memory::data_type getDtype() {
+    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
+  }
+
+  /**
+   * Get engine.
+ */ + mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); } + + protected: + /** + * Do reorder once. + * Can support inplace. + */ + void reorderOnce(void* srcData, + void* dstData, + memory::format srcFmt, + memory::format dstFmt, + memory::dims dm); + /** + * reset this MKLDNN Memory from primitve desc + */ + void resetMKLDNNMemory(memory::primitive_desc pd, real* data) { + mkldnn_primitive_t result; + mkldnn::error::wrap_c_api( + mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr), + "could not create a memory primitive"); + reset(result); + set_data_handle(data); + } + + private: + // save the CpuMatrixPtr in case the buffer released outside + CpuMatrixPtr m_; +}; + +} // namespace paddle diff --git a/paddle/legacy/math/MathFunctions.cpp b/paddle/legacy/math/MathFunctions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..152aeb5d645a58df6b6d078ce25f5921f6f1ba58 --- /dev/null +++ b/paddle/legacy/math/MathFunctions.cpp @@ -0,0 +1,348 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/math/MathFunctions.h" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" +#include "paddle/utils/DynamicLoader.h" + +namespace dynload { + +std::once_flag lapack_dso_flag; +void* lapack_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load lapack routine + * via operator overloading. + * + * note: default dynamic linked libs + */ + +// The argument for stringizing operator is not macro-expanded first. +// We have to use two levels of macro to do the expansion. +// See https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html +#define STR(x) #x + +// clang-format off +#ifndef LAPACK_FOUND +#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using lapack_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \ + void* p_##__name = dlsym(lapack_dso_handle, STR(__name)); \ + CHECK(p_##__name) << "Cannot find symbol " << STR(__name) \ + << " in liblapack.so"; \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; // struct DynLoad__##__name +#else +#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + } __name; // struct DynLoad__##__name +#endif + +#define PADDLE_SGETRF LAPACKE_sgetrf +#define PADDLE_DGETRF LAPACKE_dgetrf +#define PADDLE_SGETRI LAPACKE_sgetri +#define PADDLE_DGETRI LAPACKE_dgetri + +#define LAPACK_ROUTINE_EACH(__macro) \ + __macro(PADDLE_SGETRF) \ + __macro(PADDLE_DGETRF) \ + __macro(PADDLE_SGETRI) \ + __macro(PADDLE_DGETRI) +// clang-format on + +LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP) + +} // namespace dynload + +namespace paddle { + +#ifndef PADDLE_USE_EIGEN_FOR_BLAS +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, + const int N, + const int K, + const float alpha, + const float* A, + const int lda, + const float* B, + const int ldb, + const float beta, + float* C, + const int ldc) { + cblas_sgemm(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} + +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, + const int N, + const int K, + const double alpha, + const double* A, + const int lda, + const double* B, + const int ldb, + const double beta, + double* C, + const int ldc) { + cblas_dgemm(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} +#endif + +template <> +int getrf(const CBLAS_ORDER order, + const int M, + const int N, + float* A, + const int lda, + int* ipiv) { + return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv); +} + +template <> +int getrf(const CBLAS_ORDER order, + const int M, + const int N, + double* A, + const int lda, + int* ipiv) { + return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv); +} + +template <> +int getri(const CBLAS_ORDER order, + const int N, + float* A, + const int lda, + const int* ipiv) { + return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv); +} + +template <> +int getri(const CBLAS_ORDER order, + const int N, + double* A, + const int lda, + const int* ipiv) { + return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv); +} + +#ifndef PADDLE_USE_EIGEN_FOR_BLAS +template <> +void axpy(const int n, const float alpha, const float* x, float* y) { + cblas_saxpy(n, alpha, x, 1, y, 1); +} + +template <> +void axpy(const int n, const double alpha, const double* x, double* y) { + cblas_daxpy(n, alpha, x, 1, y, 1); +} + +template <> +float dotProduct(const int n, const float* x, const float* y) { + return cblas_sdot(n, x, 1, y, 1); +} + +template <> +double dotProduct(const int n, const double* x, const double* y) { + return cblas_ddot(n, x, 1, y, 1); +} +#endif + +#if defined(PADDLE_WITH_MKLML) + +template <> +void vExp(const int n, const float* a, float* r) { + vsExp(n, a, r); +} + +template <> +void vExp(const int n, const double* a, double* r) { + vdExp(n, a, r); +} + +template <> +void vPow(const int n, const float* a, const float b, float* r) { + vsPowx(n, a, b, r); +} + +template <> +void vPow(const int n, const double* a, const double b, double* r) { + vdPowx(n, a, b, r); +} + +template <> +void vLog(const int n, const float* a, float* r) { + vsLn(n, a, r); +} + +template <> +void vLog(const int n, const double* a, double* r) { + vdLn(n, a, r); +} + +template <> +void vAdd(const int n, const float* a, const float* b, float* r) { + vsAdd(n, a, b, r); +} + +template <> +void vAdd(const int n, const double* a, const double* b, double* r) { + vdAdd(n, a, b, r); +} + +template <> +void vTanh(const int n, const float* a, float* r) { + vsTanh(n, a, 
r); +} + +template <> +void vTanh(const int n, const double* a, double* r) { + vdTanh(n, a, r); +} + +template <> +void vInvSqrt(const int n, const float* a, float* r) { + vsInvSqrt(n, a, r); +} + +template <> +void vInvSqrt(const int n, const double* a, double* r) { + vdInvSqrt(n, a, r); +} + +template <> +void vLog1p(const int n, const float* a, float* r) { + vsLog1p(n, a, r); +} + +template <> +void vLog1p(const int n, const double* a, double* r) { + vdLog1p(n, a, r); +} +#else + +DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a)); +template +void vExp(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vExp(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a)); +template +void vLog(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vLog(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p)); +template +void vPow(const int n, const T* a, const T b, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vPow(b), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b); +template +void vAdd(const int n, const T* a, const T* b, T* r) { + hl_cpu_apply_ternary_op, 0, 0>(ternary::vAdd(), + const_cast(a), + const_cast(b), + r, + 1, + n, + n, + n, + n); +} + +DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a)); +template +void vInvSqrt(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vInvSqrt(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a)); +template +void vLog1p(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vLog1p(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template +void vTanh(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vTanh(), const_cast(a), r, 1, n, n, n); +} + +template void vExp(const int n, const float* a, float* r); +template void vExp(const int n, const double* a, double* r); +template void vLog(const int n, const float* a, float* r); +template void vLog(const int n, const double* a, double* r); +template void vPow(const int n, const float* a, const float b, float* r); +template void vPow(const int n, const double* a, const double b, double* r); +template void vAdd(const int n, const float* a, const float* b, float* r); +template void vAdd(const int n, const double* a, const double* b, double* r); +template void vInvSqrt(const int n, const double* a, double* r); +template void vInvSqrt(const int n, const float* a, float* r); +template void vLog1p(const int n, const float* a, float* r); +template void vLog1p(const int n, const double* a, double* r); +template void vTanh(const int n, const float* a, float* r); +template void vTanh(const int n, const double* a, double* r); +#endif +} // namespace paddle diff --git a/paddle/math/MathFunctions.h b/paddle/legacy/math/MathFunctions.h similarity index 100% rename from paddle/math/MathFunctions.h rename to paddle/legacy/math/MathFunctions.h diff --git a/paddle/math/MathUtils.cpp b/paddle/legacy/math/MathUtils.cpp similarity index 100% rename from paddle/math/MathUtils.cpp rename to paddle/legacy/math/MathUtils.cpp diff --git a/paddle/math/MathUtils.h b/paddle/legacy/math/MathUtils.h similarity index 100% rename from paddle/math/MathUtils.h rename to paddle/legacy/math/MathUtils.h diff --git a/paddle/legacy/math/Matrix.cpp b/paddle/legacy/math/Matrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..50b0bc501148be260464fbec4694f2f5565ce6ad --- /dev/null +++ b/paddle/legacy/math/Matrix.cpp @@ -0,0 +1,4787 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Matrix.h" +#include "MathFunctions.h" +#include "SparseMatrix.h" +#include "SparseRowMatrix.h" + +#include +#include +#include + +#include +#include "hl_cnn.h" +#include "hl_gpu.h" +#include "hl_table_apply.h" +#include "hl_top_k.h" +#include "paddle/utils/Logging.h" + +#include "NEONFunctions.h" +#include "paddle/legacy/function/GemmFunctor.h" +#include "paddle/utils/ThreadLocal.h" + +#include "SIMDFunctions.h" + +namespace paddle { + +inline real _pow(real a, real beta) { return std::pow(a, beta); } + +inline real _square(real a) { return a * a; } + +inline real _safelog(real a) { return a > 0.0f ? std::log(a) : -40.0f; } + +Matrix::Matrix(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans, + bool use_gpu) + : BaseMatrix( + height, + width, + memHandle ? 
(reinterpret_cast(memHandle->getBuf())) : nullptr, + trans, + use_gpu) { + elementCnt_ = width * height; + memoryHandle_ = memHandle; +} + +Matrix::Matrix( + real* data, size_t height, size_t width, bool trans, bool use_gpu) + : BaseMatrix(height, width, data, trans, use_gpu) { + elementCnt_ = width * height; +} + +Matrix::Matrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, + bool use_gpu) + : BaseMatrix(height, width, stride, data, trans, use_gpu) { + elementCnt_ = width * height; +} + +MatrixPtr Matrix::createSparseMatrix(real* data, + int* row, + int* col, + size_t height, + size_t width, + size_t nnz, /* used to allocate space */ + SparseValueType valueType, /*value type*/ + SparseFormat format, + bool trans, + bool useGpu) { + if (useGpu) { + return std::make_shared( + data, row, col, height, width, nnz, valueType, format, trans); + } else { + return std::make_shared( + data, row, col, height, width, nnz, valueType, format, trans); + } +} + +MatrixPtr Matrix::createSparseMatrix(size_t height, + size_t width, + size_t nnz, /* used to allocate space */ + SparseValueType valueType, /*value type*/ + SparseFormat format, + bool trans, + bool useGpu) { + if (useGpu) { + return std::make_shared( + height, width, nnz, valueType, format, trans); + } else { + return std::make_shared( + height, width, nnz, valueType, format, trans); + } +} + +MatrixPtr Matrix::create(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans) { + if (auto gpuHandle = std::dynamic_pointer_cast(memHandle)) { + return std::make_shared(gpuHandle, height, width, trans); + } else if (auto cpuHandle = + std::dynamic_pointer_cast(memHandle)) { + return std::make_shared(cpuHandle, height, width, trans); + } else { + LOG(FATAL) << "Wrong"; + return nullptr; + } +} + +MatrixPtr Matrix::create(size_t height, size_t width, bool trans, bool useGpu) { + if (useGpu) { + return std::make_shared(height, width, trans); + } else { + return std::make_shared(height, width, trans); + } +} + +MatrixPtr Matrix::create( + real* data, size_t height, size_t width, bool trans, bool useGpu) { + if (useGpu) { + return std::make_shared(data, height, width, trans); + } else { + return std::make_shared(data, height, width, trans); + } +} + +MatrixPtr Matrix::create(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, + bool useGpu) { + if (useGpu) { + return std::make_shared(data, height, width, stride, trans); + } else { + return std::make_shared(data, height, width, stride, trans); + } +} + +MatrixPtr Matrix::createSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + bool trans, + bool useGpu) { + if (useGpu) { + return std::make_shared( + height, width, nnz, valueType, SPARSE_CSR, trans); + } else { + return std::make_shared( + height, width, nnz, valueType, SPARSE_CSR, trans); + } +} + +void Matrix::resizeOrCreate( + MatrixPtr& matrix, size_t height, size_t width, bool trans, bool useGpu) { + if (!matrix) { + matrix = Matrix::create(height, width, trans, useGpu); + } else { + CHECK_EQ(matrix->useGpu(), useGpu); + matrix->resize(height, width); + } +} + +void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans, + bool useGpu) { + if (!matrix) { + matrix = Matrix::createSparseMatrix( + height, width, nnz, valueType, format, trans, useGpu); + } else { + CHECK(dynamic_cast(matrix.get()) || + dynamic_cast(matrix.get())); + 
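+    // reuse path: the existing matrix is resized in place rather than
+    // reallocated, so it must already be a sparse matrix (checked above)
+    // and must already live on the requested device (checked next).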
CHECK_EQ(matrix->useGpu(), useGpu); + matrix->resize(height, width, nnz, valueType, format); + } +} + +void Matrix::reshape(size_t height, size_t width) { + CHECK(isContiguous()); + CHECK(height_ * width_ == height * width); + height_ = height; + width_ = width; + stride_ = width_; +} + +MatrixPtr Matrix::subMatrix(size_t startRow, + size_t endRow, + size_t startCol, + size_t endCol) { + CHECK_LE(startRow, endRow); + CHECK_LE(endRow, getHeight()); + CHECK_LE(startCol, endCol); + CHECK_LE(endCol, getWidth()); + + return Matrix::create(getData() + startRow * getStride() + startCol, + endRow - startRow, + endCol - startCol, + getStride(), + trans_, + useGpu_); +} + +void Matrix::setDiag(real value) { + CHECK(data_ != NULL); + CHECK_EQ(height_, width_); + + zeroMem(); + BaseMatrix diag(height_, 1, stride_ + 1, data_, false, useGpu_); + diag.assign(value); +} + +GpuMatrix::GpuMatrix(size_t height, size_t width, bool trans) + : Matrix(std::make_shared(height * width * sizeof(real)), + height, + width, + trans, + true) {} + +GpuMatrix::~GpuMatrix() {} + +void GpuMatrix::zeroMem() { + CHECK(data_ != NULL); + zero(); +} + +void GpuMatrix::resetOne() { + CHECK(data_ != NULL); + one(); +} + +void GpuMatrix::resize(size_t newHeight, size_t newWidth) { + size_t newSize = newHeight * newWidth; + if (NULL == memoryHandle_.get() || + newSize * sizeof(real) > memoryHandle_->getAllocSize()) { + memoryHandle_ = std::make_shared(newSize * sizeof(real)); + data_ = reinterpret_cast(memoryHandle_->getBuf()); + } + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newSize; + stride_ = width_; +} + +real GpuMatrix::getElement(size_t x, size_t y) const { + real elem = 0; + hl_memcpy_device2host(&elem, &data_[x * stride_ + y], sizeof(real)); + return elem; +} + +real GpuMatrix::getSum() { + CHECK(isContiguous()); + real sum = 0.0f; + hl_vector_sum(data_, &sum, height_ * width_); + return sum; +} + +real GpuMatrix::getMin() { + CHECK(isContiguous()); + auto vec = GpuVector(height_ * width_, data_); + return vec.getMin(); +} + +real GpuMatrix::getMax() { + CHECK(isContiguous()); + auto vec = GpuVector(height_ * width_, data_); + return vec.getMax(); +} + +void GpuMatrix::accumulateColSum(Matrix& src) { + CHECK_EQ(getWidth(), src.getWidth()); + CHECK_EQ(getHeight(), (size_t)1); + sumCols(src, 1.0, 1.0); +} + +real GpuMatrix::getAbsSum() { + CHECK(isContiguous()); + real sum = 0.0f; + hl_vector_abs_sum(data_, &sum, height_ * width_); + return sum; +} + +void GpuMatrix::copyFrom(const Matrix& src) { + CHECK(isContiguous()); + CHECK(src.isContiguous()); + CHECK(elementCnt_ == src.getElementCnt()); + + if (typeid(src) == typeid(CpuMatrix)) { + hl_memcpy_host2device( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); + } else if (typeid(src) == typeid(GpuMatrix)) { + hl_memcpy_device2device( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); + } else { + LOG(FATAL) << "Wrong"; + } +} + +void GpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { + CHECK(isContiguous()); + CHECK(src.isContiguous()); + CHECK(elementCnt_ == src.getElementCnt()); + hl_memcpy_async(this->getData(), + const_cast(src.getData()), + sizeof(real) * elementCnt_, + stream); +} + +void GpuMatrix::copyFrom(const real* hostSrc, size_t size) { + CHECK(isContiguous()); + CHECK(size <= elementCnt_); + hl_memcpy_host2device(data_, const_cast(hostSrc), sizeof(real) * size); +} + +void GpuMatrix::copyFrom(const real* hostSrc, const int64_t* seq) { + LOG(FATAL) << "not implemented"; +} + +void GpuMatrix::copyFrom(const 
IVector& src) { + CHECK(isContiguous()); + CpuMatrix matrix(src.getSize(), 1, false); + matrix.copyFrom(src); + copyFrom(matrix); +} + +void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { + size_t height = getHeight(); + size_t width = getWidth(); + CHECK_EQ(b.getWidth(), width); + real* dst = getData(); + real* src = b.getData(); + const int* index = rowIndex.getData(); + hl_sequence2batch_copy(dst, src, index, width, height, true); +} + +MatrixPtr GpuMatrix::clone(size_t height, size_t width, bool useGpu) { + CHECK(isContiguous()); + + if (height == 0 && width == 0) { + height = height_; + width = width_; + } + + CHECK(width && height); + + if (useGpu) { + return std::make_shared(height, width); + } else { + return std::make_shared(height, width); + } +} + +MatrixPtr GpuMatrix::getTranspose() { + if (memoryHandle_.get() != NULL) { + MatrixPtr copy_T( + new GpuMatrix(std::dynamic_pointer_cast(memoryHandle_), + height_, + width_, + true)); + return copy_T; + } else { + MatrixPtr copy_T(new GpuMatrix(data_, height_, width_, true)); + return copy_T; + } +} + +void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { + if (memAlloc) { + matTrans = std::make_shared(width_, height_); + } else { + CHECK(matTrans != NULL); + CHECK_EQ(matTrans->getHeight(), width_); + CHECK_EQ(matTrans->getWidth(), height_); + } + real* dataTrans = matTrans->getData(); + real* data = getData(); + int lda = getStride(); + int ldc = matTrans->getStride(); + + hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc); +} + +void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { + if (memAlloc) { + matRot = std::make_shared(width_, height_); + } else { + CHECK(matRot != NULL); + CHECK_EQ(matRot->getHeight(), width_); + CHECK_EQ(matRot->getWidth(), height_); + } + + real* dataRot = matRot->getData(); + real* data = getData(); + hl_matrix_rotate(data, dataRot, height_, width_, clockWise); +} + +MatrixPtr GpuMatrix::getInverse() { + MatrixPtr matInv; + inverse(matInv, true); + return matInv; +} + +void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { + CHECK_EQ(height_, width_); + + if (memAlloc) { + matInv = std::make_shared(height_, width_); + } else { + CHECK(matInv != NULL); + } + + real* data = getData(); + real* dataInv = matInv->getData(); + int lda = getStride(); + int ldc = matInv->getStride(); + + hl_matrix_inverse(data, dataInv, height_, lda, ldc); +} + +void GpuMatrix::addBias(Matrix& b, real scale) { + CHECK(b.getHeight() == 1) << "the Bias should be a vector"; + BaseMatrix::addBias(b, scale); +} + +void GpuMatrix::addSharedBias(Matrix& b, real scale) { + CHECK(b.getHeight() == 1) << "the Bias should be a vector"; + CHECK_LE(b.getWidth(), getWidth()); + CHECK_EQ(getWidth() % b.getWidth(), 0UL); + hl_matrix_add_shared_bias( + getData(), b.getData(), b.getWidth(), getHeight(), getWidth(), scale); +} + +void GpuMatrix::collectBias(Matrix& a, real scale) { +#ifdef PADDLE_WITH_CUDA + CHECK_EQ(getHeight(), (size_t)1); + CHECK_EQ(width_, a.getWidth()); + GpuSparseMatrix* sMatPtr = dynamic_cast(&a); + if (!sMatPtr) { + sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); + } else { + real* data = getData(); + hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get(); + hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale); + } +#endif +} + +void GpuMatrix::collectSharedBias(Matrix& a, real scale) { + CHECK_EQ(getHeight(), (size_t)1); + CHECK_EQ(a.getWidth() % getWidth(), 0UL); + hl_matrix_collect_shared_bias( + getData(), a.getData(), getWidth(), 
a.getHeight(), a.getWidth(), scale); +} + +void GpuMatrix::sequenceAvgForward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + + hl_sequence_avg_forward(dst, src, starts, height, width, mode); +} + +void GpuMatrix::sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = a.getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + + hl_sequence_avg_backward(dst, src, starts, height, width, mode); +} + +/* this = scaleAB*(a*b) + scaleT*this */ +void GpuMatrix::mul(const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT) { + CHECK(!isTransposed()) << "Not supported"; + + if (!a.isTransposed() && !b.isTransposed()) { + CHECK_EQ(width_, b.width_); + CHECK_EQ(height_, a.height_); + CHECK_EQ(a.width_, b.height_); + } else if (a.isTransposed() && !b.isTransposed()) { + CHECK_EQ(width_, b.width_); + CHECK_EQ(height_, a.width_); + CHECK_EQ(a.height_, b.height_); + } else if (!a.isTransposed() && b.isTransposed()) { + CHECK_EQ(width_, b.height_); + CHECK_EQ(height_, a.height_); + CHECK_EQ(a.width_, b.width_); + } else { + LOG(FATAL) << "Is not supported"; + } + + real* A_d = a.data_; + real* B_d = b.data_; + real* C_d = data_; + int dimM = getHeight(); + int dimN = getWidth(); + int dimK = !a.isTransposed() ? a.width_ : a.height_; + int lda = a.getStride(); + int ldb = b.getStride(); + int ldc = getStride(); + hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T; + hl_trans_op_t transb = !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T; + + hl_matrix_mul(A_d, + transa, + B_d, + transb, + C_d, + dimM, + dimN, + dimK, + scaleAB, + scaleT, + lda, + ldb, + ldc); +} + +void GpuMatrix::mul(const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT) { +#ifdef PADDLE_WITH_CUDA + CHECK(isContiguous()); + CHECK(b.isContiguous()); + CHECK(b.useGpu_ == true) << "Matrix type are not equal"; + CHECK(!trans_ && !b.trans_) << "not supported"; + + if (!a.trans_) { + CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_) + << "Matrix dimensions are not equal"; + } else { + CHECK(width_ == b.width_ && height_ == a.width_ && a.height_ == b.height_) + << "Matrix dimensions are not equal"; + } + hl_trans_op_t transA = a.trans_ ? HPPL_OP_T : HPPL_OP_N; + hl_sparse_matrix_s A_d = a.sMatrix_.get(); + real* B_d = b.data_; + real* C_d = data_; + hl_matrix_csr_mul_dense(A_d, + transA, + B_d, + HPPL_OP_N, + C_d, + height_, + width_, + b.height_, + scaleAB, + scaleT); +#endif +} + +void GpuMatrix::mul(const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, + real scaleT) { +#ifdef PADDLE_WITH_CUDA + CHECK(isContiguous()); + CHECK(a.isContiguous()); + CHECK(a.useGpu_ == true) << "Matrix type are not equal"; + + hl_sparse_matrix_s B_d = b.sMatrix_.get(); + real* A_d = a.data_; + real* C_d = data_; + hl_trans_op_t transB = b.trans_ ? 
HPPL_OP_T : HPPL_OP_N; + if (!b.trans_) { + CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_) + << "Matrix dimensions are not equal"; + } else { + CHECK(width_ == b.height_ && height_ == a.height_ && a.width_ == b.width_) + << "Matrix dimensions are not equal"; + } + if (b.format_ == SPARSE_CSC) { + hl_matrix_dense_mul_csc(A_d, + HPPL_OP_N, + B_d, + transB, + C_d, + height_, + width_, + a.width_, + scaleAB, + scaleT); + } else { + hl_matrix_dense_mul_csr(A_d, + HPPL_OP_N, + B_d, + transB, + C_d, + height_, + width_, + a.width_, + scaleAB, + scaleT); + } +#endif +} + +/* this = a*b */ +void GpuMatrix::mul(const Matrix& a, const Matrix& b) { mul(a, b, 1.0, 0.0); } + +void GpuMatrix::mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { + const auto a_ptr = dynamic_cast(&a); + const auto b_ptr = dynamic_cast(&b); + const auto a_ptr_s = dynamic_cast(&a); + const auto b_ptr_s = dynamic_cast(&b); + + if (a_ptr && b_ptr) { + mul(*a_ptr, *b_ptr, scaleAB, scaleT); + } else if (a_ptr_s && b_ptr) { + mul(*a_ptr_s, *b_ptr, scaleAB, scaleT); + } else if (a_ptr && b_ptr_s) { + mul(*a_ptr, *b_ptr_s, scaleAB, scaleT); + } else { + LOG(FATAL) << "Not supported"; + } +} + +/* this = this* b */ +void GpuMatrix::rightMul(Matrix& b) { rightMul(b, 1.0, 0.0); } + +/* this = scaleAB*(this*b) + scaleT*this */ +void GpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) { + CHECK(dynamic_cast(&b)); + CHECK(!isTransposed()) << "Not supported"; + CHECK(!b.isTransposed()) << "Not supported"; + mul(*this, *dynamic_cast(&b), scaleAB, scaleT); +} + +/* this = a*this */ +void GpuMatrix::leftMul(Matrix& a) { leftMul(a, 1.0, 0.0); } + +/* this = scaleAB*(a*this) + scaleT*this */ +void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) { + CHECK(dynamic_cast(&a)); + CHECK(!isTransposed()) << "Not supported"; + CHECK(!a.isTransposed()) << "Not supported"; + mul(*dynamic_cast(&a), *this, scaleAB, scaleT); +} + +void GpuMatrix::selectRows(Matrix& table, IVector& ids) { +#ifdef PADDLE_WITH_CUDA + CHECK(dynamic_cast(&table)); + CHECK(table.useGpu()); + CHECK(ids.useGpu()); + CHECK_EQ(getHeight(), ids.getSize()); + CHECK_EQ(getWidth(), table.getWidth()); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + real* a = getData(); + size_t tableSize = table.getHeight(); + int* index = ids.getData(); + + hl_matrix_select_rows(a, + stride_, + table.getData(), + table.stride_, + index, + numSamples, + tableSize, + dim); +#endif +} + +void GpuMatrix::addToRows(Matrix& table, IVector& ids) { +#ifdef PADDLE_WITH_CUDA + CHECK(dynamic_cast(&table)); + CHECK(table.useGpu()); + CHECK(ids.useGpu()); + CHECK_EQ(getHeight(), ids.getSize()); + CHECK_EQ(getWidth(), table.getWidth()); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + real* a = getData(); + size_t tableSize = table.getHeight(); + int* index = ids.getData(); + + hl_matrix_add_to_rows(table.getData(), + table.stride_, + a, + stride_, + index, + numSamples, + tableSize, + dim); +#endif +} + +void GpuMatrix::colMerge(Matrix& src) { + CHECK(src.height_ == height_); + if (!trans_ && !src.trans_) { + sumRows(src, /* scaleSum= */ 1, /* scaleDest= */ 0); + } else { + LOG(FATAL) << "Is not supported"; + } +} + +void GpuMatrix::rowSum(Matrix& sum) { + CHECK_EQ(sum.getHeight(), getHeight()); + CHECK_EQ(sum.getWidth(), (size_t)1); + + sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); +} + +void GpuMatrix::rowMax(Matrix& max) { + CHECK_EQ(max.getHeight(), getHeight()); + CHECK_EQ(max.getWidth(), (size_t)1); + + 
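+  // Each row of *this is reduced to its maximum and written into the
+  // corresponding entry of the height x 1 output matrix.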
max.maxRows(*this); +} + +void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { +#ifdef PADDLE_WITH_CUDA + CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal"; + size_t numSamples = getHeight(); + size_t beam = maxVal.getWidth(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getHeight(), numSamples); + CHECK_EQ(maxVal.getWidth(), beam); + + hl_matrix_top_k(maxVal.getData(), + maxVal.getStride(), + maxIds.getData(), + this->getData(), + this->getStride(), + this->getWidth(), + beam, + numSamples); +#endif +} + +void GpuMatrix::colMax(Matrix& max) { + CHECK_EQ(max.getWidth(), getWidth()); + CHECK_EQ(max.getHeight(), (size_t)1); + + max.maxCols(*this); +} + +void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "Is not supported"; +} + +void GpuMatrix::maxoutForward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = getWidth(); + size_t batchSize = getHeight(); + const real* input = a.getData(); + real* output = getData(); + int* idForGpu = id.getData(); + + hl_maxout_forward( + input, output, idForGpu, batchSize, size, size / channels, groups); +} + +void GpuMatrix::maxoutBackward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = a.getWidth(); + size_t batchSize = getHeight(); + real* input = getData(); + const real* output = a.getData(); + const int* idForGpu = id.getData(); + + hl_maxout_backward( + input, output, idForGpu, batchSize, size, size / channels, groups); +} + +/*calulate the error of classification */ +void GpuMatrix::classificationError(Matrix& output, + IVector& label, + size_t topkSize) { + auto gpuOutput = dynamic_cast(&output); + auto gpuLabel = dynamic_cast(&label); + size_t numSamples = this->getHeight(); + GpuMatrixPtr gpuTopVal = std::make_shared(numSamples, topkSize); + GpuIVectorPtr gpuTopIds = std::make_shared(numSamples * topkSize); + + CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer"; + CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed"; + CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal"; + CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1) + << "Matrix dimensions are not equal"; + + size_t dim = gpuOutput->getWidth(); + hl_matrix_classification_error(gpuTopVal->getData(), + gpuTopVal->getStride(), + gpuTopIds->getData(), + gpuOutput->getData(), + gpuOutput->getStride(), + dim, + topkSize, + numSamples, + gpuLabel->getData(), + this->getData()); +} + +/* copy -log(output[i * width + label]) to this->data[i] */ +void GpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) { + GpuMatrix* output_ptr = dynamic_cast(&output); + GpuIVector* label_ptr = dynamic_cast(&label); + + CHECK(output_ptr && label_ptr) << "Invalid argument pointer"; + + CHECK(height_ == label.getSize() && width_ == 1 && height_ == output.height_) + << "Matrix dimensions are not equal"; + + real* A_d = output_ptr->data_; + real* C_d = data_; + int* label_d = label_ptr->getData(); + + hl_matrix_cross_entropy(A_d, C_d, label_d, height_, output.width_); +} + +/* calculate the error of outputV according to label */ +void GpuMatrix::oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { + GpuMatrix* output_ptr = dynamic_cast(&outputV); + GpuIVector* label_ptr = dynamic_cast(&label); + + CHECK(output_ptr && label_ptr) << 
"Invalid argument pointer"; + + CHECK(height_ == output_ptr->height_ && width_ == output_ptr->width_) + << "Matrix dimensions are not equal"; + + real* output_d = output_ptr->data_; + real* grad_d = data_; + int* label_d = label_ptr->getData(); + + hl_matrix_cross_entropy_bp(grad_d, output_d, label_d, height_, width_); +} + +void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha) { + LOG(FATAL) << "Not implemented"; +} + +void GpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha) { + LOG(FATAL) << "Not implemented"; +} + +void GpuMatrix::softmax(Matrix& output) { + CHECK(output.useGpu()) << "Matrix type are not equal"; + + size_t height = getHeight(); + size_t width = getWidth(); + CHECK(height == output.getHeight() && width == output.getWidth()) + << "Matrix dimensions are not equal"; + + real* inputData = getData(); + real* outputData = output.getData(); + hl_matrix_softmax(inputData, outputData, height, width); +} + +void GpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) { + CHECK_EQ(getWidth(), 1UL); + CHECK_EQ(output.getWidth(), 1UL); + CHECK(isContiguous()); + + real* inputData = getData(); + real* outputData = output.getData(); + auto starts = index.getData(); + int numSequences = index.getSize() - 1; + hl_sequence_softmax_forward(inputData, outputData, starts, numSequences); +} + +void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { + CHECK(output.useGpu_ == true && sftmaxSum.useGpu_ == true) + << "Matrix type are not equal"; + + CHECK(height_ == output.height_ && width_ == output.width_ && + height_ == sftmaxSum.height_) + << "Matrix dimensions are not equal"; + + real* output_d = output.data_; + real* sftmaxSum_d = sftmaxSum.data_; + real* grad_d = data_; + hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, width_); +} + +void GpuMatrix::softmaxBackward(Matrix& outputV) { + CHECK(outputV.useGpu()) << "Matrix type are not equal"; + + size_t height = getHeight(); + size_t width = getWidth(); + CHECK(height == outputV.getHeight() && width == outputV.getWidth()) + << "Matrix dimensions are not equal"; + + real* output_grad = getData(); + real* output_value = outputV.getData(); + hl_softmax_backward(output_value, output_grad, height, width); +} + +void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { + CHECK_EQ(label.getHeight(), height_); + CHECK_EQ(output.getHeight(), height_); + CHECK_EQ(label.getWidth(), output.getWidth()); + CHECK_EQ((size_t)1, width_); + + auto labelptr = dynamic_cast(&label); + if (labelptr) { + LOG(FATAL) << "not supported: GpuSparseMatrix as label"; + } + + BaseMatrix::sumOfSquaredDiffs(output, + label, + /* scaleSum= */ 1, + /* scaleDest= */ 1); +} + +void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) { + add2(outputV, label, 1, 2, -2); +} + +void GpuMatrix::tanh(Matrix& output) { BaseMatrix::tanh(output); } + +void GpuMatrix::tanhDerivative(Matrix& output) { + BaseMatrix::tanhDerivative(output); +} + +void GpuMatrix::softrelu(Matrix& output) { BaseMatrix::softrelu(output); } + +void GpuMatrix::softreluDerivative(Matrix& output) { + BaseMatrix::softreluDerivative(output); +} + +void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { + BaseMatrix::scaledTanh(output, p1, p2); +} + +void GpuMatrix::randomizeUniform() { + CHECK(isContiguous()); + real* data = data_; + size_t size = height_ * width_; + + hl_rand(data, size); +} + +void GpuMatrix::print(std::ostream& os) const { + CHECK(isContiguous()); + 
CpuMatrix cpuMat(getHeight(), getWidth()); + cpuMat.copyFrom(*this); + cpuMat.print(os); +} + +void GpuMatrix::print(std::ostream& os, size_t height, size_t width) const { + CHECK(isContiguous()); + CpuMatrix cpuMat(getHeight(), getWidth()); + cpuMat.copyFrom(*this); + cpuMat.print(os, height, width); +} + +void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { + CHECK(isContiguous()); + CHECK(height_ == refMat.getHeight()); + CHECK(width_ == refMat.getWidth()); + CpuMatrix cpuRef(height_, width_); + GpuMatrix gpuRef(height_, width_); + cpuRef.copyFrom(refMat); + gpuRef.copyFrom(*this); + size_t diffCnt = 0; + for (size_t i = 0; i < height_; ++i) { + for (size_t j = 0; j < width_; ++j) { + real a = gpuRef.getElement(i, j); + real b = cpuRef.getElement(i, j); + if (fabs(a - b) > 0.00001) { + ++diffCnt; + if (printDiff) { + os << "ref= " << a << " check= " << b << std::endl; + } + } + } + } + LOG(INFO) << "the diffCnt is " << diffCnt; +} + +void GpuMatrix::upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + CHECK(input.useGpu_ == true) << "Matrix type are not equal"; + CHECK(mask.useGpu_ == true) << "Matrix type are not equal"; + + real* inputData = input.getData(); + real* maskData = mask.getData(); + real* outData = data_; + + size_t batch = input.getHeight(); + + CHECK(imgSizeH * imgSizeW * channels == input.getWidth()); + CHECK(imgSizeH * imgSizeW * channels == mask.getWidth()); + CHECK_EQ(batch, this->getHeight()); + CHECK(width_ == outputH * outputW * channels); + hl_upsample_forward(inputData, + maskData, + batch, + imgSizeH, + imgSizeW, + channels, + outputH, + outputW, + outData); +} + +void GpuMatrix::upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + CHECK(outputGrad.useGpu_ == true) << "Matrix type are not equal"; + CHECK(mask.useGpu_ == true) << "Matrix type are not equal"; + + real* outputGradData = outputGrad.getData(); + real* maskData = mask.getData(); + real* inputGradData = data_; + size_t batch = outputGrad.getHeight(); + + CHECK(imgSizeH * imgSizeW == this->getWidth() / channels); + CHECK_EQ(batch, this->getHeight()); + CHECK_EQ(channels * outputH * outputW, outputGrad.getWidth()); + hl_upsample_backward(outputGradData, + maskData, + batch, + imgSizeH, + imgSizeW, + channels, + outputH, + outputW, + inputGradData); +} + +void GpuMatrix::maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + MatrixPtr maskMatP) { + CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + real* maskData = NULL; + size_t frameNum = inputMat.getHeight(); + CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == outputH * outputW * channels); + + if (maskMatP != NULL) { + CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal"; + CHECK(outputH * outputW * channels == maskMatP->getWidth()); + maskData = maskMatP->getData(); + } + + hl_maxpool_forward(frameNum, + inputData, + channels, + imgSizeH, + imgSizeW, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + data_, + getStride(), + maskData); +} + +void GpuMatrix::maxPoolBackward(Matrix& inputMat, + 
size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { + CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true && + outV.useGpu_ == true) + << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + real* outData = outV.getData(); + real* outDiff = outGrad.getData(); + size_t frameNum = inputMat.getHeight(); + size_t channels = outV.getWidth() / outputH / outputW; + CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(outGrad.getHeight() == outV.getHeight() && + outGrad.getWidth() == outV.getWidth()); + + hl_maxpool_backward(frameNum, + inputData, + outData, + outDiff, + channels, + imgSizeH, + imgSizeW, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + data_, + outGrad.getStride()); +} + +void GpuMatrix::avgPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + bool excludeMode) { + CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + size_t frameNum = inputMat.getHeight(); + CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == outputH * outputW * channels); + + hl_avgpool_forward(frameNum, + inputData, + channels, + imgSizeH, + imgSizeW, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + data_, + getStride(), + excludeMode); +} + +void GpuMatrix::avgPoolBackward(Matrix& outGrad, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW, + bool excludeMode) { + CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal"; + + real* outDiff = outGrad.getData(); + size_t frameNum = outGrad.getHeight(); + size_t channels = outGrad.getWidth() / outputH / outputW; + CHECK(imgSizeH * imgSizeW * channels == width_); + CHECK(height_ == outGrad.getHeight()); + CHECK(outGrad.getWidth() == outputH * outputW * channels); + + hl_avgpool_backward(frameNum, + outDiff, + channels, + imgSizeH, + imgSizeW, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + data_, + outGrad.getStride(), + excludeMode); +} + +void GpuMatrix::maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + CHECK(inputMat.useGpu_) << "Matrix type are not correct"; + + real* inputData = inputMat.getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); + size_t num = inputMat.getHeight(); + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == outputD * outputH * outputW * channels); + + hl_maxpool3D_forward(num, + inputData, + channels, + imgSizeD, + imgSizeH, + 
imgSizeW, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + getData(), + maxPoolIdxData, + getStride()); +} + +void GpuMatrix::maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal"; + + real* outDiff = outGrad.getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); + size_t frameNum = getHeight(); + size_t channels = outGrad.getWidth() / outputD / outputH / outputW; + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth()); + CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() && + outGrad.getWidth() == maxPoolIdx.getWidth()); + + hl_maxpool3D_backward(frameNum, + outDiff, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + getData(), + maxPoolIdxData, + outGrad.getStride()); +} + +void GpuMatrix::avgPool3DForward(Matrix& inputMat, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + CHECK(inputMat.useGpu_) << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + size_t frameNum = inputMat.getHeight(); + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == outputD * outputH * outputW * channels); + + hl_avgpool3D_forward(frameNum, + inputData, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + getData(), + getStride()); +} + +void GpuMatrix::avgPool3DBackward(Matrix& outGrad, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + CHECK(outGrad.useGpu_) << "Matrix type are not equal"; + + real* outDiff = outGrad.getData(); + size_t frameNum = outGrad.getHeight(); + size_t channels = outGrad.getWidth() / outputD / outputH / outputW; + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == width_); + CHECK(height_ == outGrad.getHeight()); + CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels); + + hl_avgpool3D_backward(frameNum, + outDiff, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + getData(), + outGrad.getStride()); +} + +void GpuMatrix::maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index) { + CHECK(dynamic_cast(&input)); + CHECK(dynamic_cast(&sequence)); + CHECK(dynamic_cast(&index)); + + real* outData = getData(); + real* inputData = 
input.getData(); + const int* starts = sequence.getData(); + int* maxIndex = index.getData(); + size_t numSequences = getHeight(); + size_t dim = getWidth(); + + CHECK_EQ(dim, input.getWidth()); + CHECK_EQ(numSequences, sequence.getSize() - 1); + CHECK_EQ(numSequences * dim, index.getSize()); + + hl_max_sequence_forward( + inputData, starts, outData, maxIndex, numSequences, dim); +} + +void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index) { + CHECK(dynamic_cast(&outputGrad)); + CHECK(dynamic_cast(&sequence)); + CHECK(dynamic_cast(&index)); + + real* inputGrad = getData(); + real* outGrad = outputGrad.getData(); + int* maxIndex = index.getData(); + size_t dim = getWidth(); + size_t numSequences = sequence.getSize() - 1; + + CHECK_EQ(dim, outputGrad.getWidth()); + CHECK_EQ(numSequences, outputGrad.getHeight()); + CHECK_EQ(numSequences * dim, index.getSize()); + + hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim); +} + +void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) { + CHECK(data.useGpu_ == true && W.useGpu_ == true) + << "Matrix type are not equal"; + real* input = data.getData(); + real* w = W.getData(); + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + real* output = getData(); + hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum); +} + +void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { + CHECK(oGrad.useGpu_ == true && data.useGpu_ == true) + << "Matrix type are not equal"; + real* ograd = oGrad.getData(); + real* input = data.getData(); + real* wgrad = data_; + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = this->getHeight() * this->getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + hl_param_relu_backward_w( + wgrad, ograd, input, numElements, numSamples, partial_sum); +} + +void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { + real* diff = data_; + real* input = data.getData(); + real* ograd = oGrad.getData(); + real* w = W.getData(); + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + hl_param_relu_backward_diff( + ograd, input, w, diff, numElements, numSamples, partial_sum); +} + +void GpuMatrix::addColumnVector(const Matrix& b) { + BaseMatrix::addColVector(const_cast(b)); +} + +void GpuMatrix::bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast(&in)); + + const size_t outputW = getWidth(); + const size_t outputH = getHeight(); + const size_t inputW = in.getWidth(); + const size_t inputH = in.getHeight(); + + real* outData = getData(); + const real* inData = in.getData(); + + if (inImgH == outImgW && inImgW == outImgW) { + this->copyFrom(in); + } else { + hl_bilinear_forward(inData, + inImgH, + inImgW, + inputH, + inputW, + outData, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); + } +} + 
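+// A rough sketch (an assumption for illustration, not part of this diff) of
+// the per-pixel blend hl_bilinear_forward is expected to compute: for output
+// pixel (i, j), let h = floor(i * ratioH), w = floor(j * ratioW), and
+// dh = i * ratioH - h, dw = j * ratioW - w; then, per channel,
+//   out(i, j) = (1 - dh) * ((1 - dw) * in(h, w)     + dw * in(h, w + 1))
+//             + dh       * ((1 - dw) * in(h + 1, w) + dw * in(h + 1, w + 1))
+// with the edge rows and columns clamped to stay inside the input image.
+// The equal-size case in bilinearForward above skips this and degenerates to
+// a plain copy.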
+void GpuMatrix::bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast(&out)); + + const size_t inputW = getWidth(); + const size_t inputH = getHeight(); + const size_t outputW = out.getWidth(); + const size_t outputH = out.getHeight(); + + real* inGrad = getData(); + const real* outGrad = out.getData(); + + if (outImgH == inImgH && outImgW == inImgW) { + this->add(const_cast(out)); + } else { + hl_bilinear_backward(inGrad, + inImgH, + inImgW, + inputH, + inputW, + outGrad, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); + } +} + +void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { +#ifdef PADDLE_WITH_CUDA + GpuMatrix* outputPtr = dynamic_cast(&output); + auto labelPtr = dynamic_cast(&label); + + CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; + CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; + CHECK(height_ == outputPtr->height_ && width_ == 1 && + outputPtr->width_ == labelPtr->getWidth() && + outputPtr->height_ == labelPtr->getHeight()) + << "Matrix dimensions are not equal"; + + real* output_d = outputPtr->data_; + real* entropy_d = data_; + hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); + hl_matrix_multi_binary_cross_entropy( + output_d, entropy_d, mat_d, height_, outputPtr->width_); +#endif +} + +void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { +#ifdef PADDLE_WITH_CUDA + GpuMatrix* outputPtr = dynamic_cast(&output); + auto labelPtr = dynamic_cast(&label); + + CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; + CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; + CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ && + outputPtr->width_ == labelPtr->getWidth() && + outputPtr->height_ == labelPtr->getHeight()) + << "Matrix dimensions are not equal"; + + real* output_d = outputPtr->data_; + real* grad_d = data_; + hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); + hl_matrix_multi_binary_cross_entropy_bp( + output_d, grad_d, mat_d, height_, width_); +#endif +} + +void GpuMatrix::vol2Col(real* dataSrc, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW) { + hl_matrix_vol2Col(dataSrc, + channels, + depth, + height, + width, + filterD, + filterH, + filterW, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + getData()); +} + +void GpuMatrix::col2Vol(real* dataDst, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real alpha, + real beta) { + hl_matrix_col2Vol(dataDst, + channels, + depth, + height, + width, + filterD, + filterH, + filterW, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + getData(), + alpha, + beta); +} + +/** + * CpuMatrix + */ + +CpuMatrix::CpuMatrix(size_t height, size_t width, bool trans) + : Matrix(std::make_shared(height * width * sizeof(real)), + height, + width, + trans, + false) {} + +CpuMatrix::~CpuMatrix() {} + +void CpuMatrix::zeroMem() { + CHECK(data_ != NULL); + if (isContiguous()) { + memset(data_, 0, height_ * width_ * sizeof(real)); + } else { + BaseMatrix::zero(); + } +} +void 
CpuMatrix::resetOne() { + CHECK(data_ != NULL); + BaseMatrix::one(); +} + +void CpuMatrix::copyFrom(const Matrix& src) { + CHECK(isContiguous()); + if (typeid(src) == typeid(GpuMatrix)) { + CHECK(src.isContiguous()); + CHECK(elementCnt_ == src.getElementCnt()); + hl_memcpy_device2host( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); + } else if (typeid(src) == typeid(CpuMatrix) || + typeid(src) == typeid(SharedCpuMatrix)) { + CHECK(src.isContiguous()); + CHECK(elementCnt_ == src.getElementCnt()); + memcpy(data_, src.getData(), sizeof(real) * elementCnt_); + } else if (typeid(src) == typeid(CpuSparseMatrix)) { + CHECK_GE(elementCnt_, src.getElementCnt()); + copyFrom(dynamic_cast(const_cast(src))); + } else { + LOG(FATAL) << "Wrong"; + } +} + +void CpuMatrix::copyFrom(CpuSparseMatrix& src) { + CHECK(isContiguous()); + CHECK(height_ == src.getHeight()); + CHECK(width_ == src.getWidth()); + memset(data_, 0, sizeof(real) * height_ * width_); + if (src.getValueType() == FLOAT_VALUE) { + if (src.getFormat() == SPARSE_CSC) { + int* rows = src.getRows(); + real* vals = src.getValue(); + for (size_t i = 0; i < width_; i++) { + for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1); + j++) { + data_[rows[j] * width_ + i] = vals[j]; + } + } + } else { + int* cols = src.getCols(); + real* vals = src.getValue(); + for (size_t i = 0; i < height_; i++) { + for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1); + j++) { + data_[i * width_ + cols[j]] = vals[j]; + } + } + } + } else { + if (src.getFormat() == SPARSE_CSC) { + int* rows = src.getRows(); + for (size_t i = 0; i < width_; i++) { + for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1); + j++) { + data_[rows[j] * width_ + i] = 1.0; + } + } + } else { + int* cols = src.getCols(); + for (size_t i = 0; i < height_; i++) { + for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1); + j++) { + data_[i * width_ + cols[j]] = 1.0; + } + } + } + } +} + +void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { + CHECK(isContiguous()); + CHECK(src.isContiguous()); + CHECK(elementCnt_ == src.getElementCnt()); + if (typeid(src) == typeid(GpuMatrix)) { + hl_memcpy_async(this->getData(), + const_cast(src.getData()), + sizeof(real) * elementCnt_, + stream); + // There is a need to add synchronization to ensure that the data is copied. 
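+    // hl_memcpy_async may return before the device-to-host transfer has
+    // finished, so the stream is synchronized before the host buffer is read.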
+ hl_stream_synchronize(stream); + } else if (typeid(src) == typeid(CpuMatrix)) { + memcpy(data_, src.getData(), sizeof(real) * elementCnt_); + } else { + LOG(FATAL) << "Wrong"; + } +} + +void CpuMatrix::copyFrom(const real* cpuSrc, size_t size) { + CHECK(isContiguous()); + CHECK(size <= elementCnt_); + memcpy(data_, cpuSrc, sizeof(real) * size); +} + +void CpuMatrix::copyFrom(const real* cpuSrc, const int64_t* seq) { + CHECK(isContiguous()); + for (size_t i = 0; i < height_; i++) { + memcpy(data_ + i * width_, cpuSrc + seq[i] * width_, sizeof(real) * width_); + } +} + +void CpuMatrix::copyFrom(const IVector& src) { + CHECK(isContiguous()); + CHECK(elementCnt_ == src.getSize()) + << "the src and dst should have same size."; + const int* cpuSrc = NULL; + IVectorPtr tmp; + if (src.useGpu()) { + CpuIVector tmp(src.getSize()); + tmp.copyFrom(src); + cpuSrc = tmp.getData(); + } else { + cpuSrc = src.getData(); + } + for (size_t i = 0; i < elementCnt_; ++i) { + data_[i] = cpuSrc[i]; + } +} + +void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { + size_t height = getHeight(); + size_t width = getWidth(); + CHECK_EQ(b.getWidth(), width); + const int* index = rowIndex.getData(); + for (size_t i = 0; i < height; i++) { + CHECK_LT(static_cast(index[i]), b.getHeight()); + real* src = b.getData() + index[i] * width; + real* dst = getData() + i * width; + memcpy(dst, src, sizeof(real) * width); + } +} + +MatrixPtr CpuMatrix::clone(size_t height, size_t width, bool useGpu) { + CHECK(isContiguous()); + + if (height == 0 && width == 0) { + height = height_; + width = width_; + } + + CHECK(width && height); + + if (useGpu) { + return std::make_shared(height, width); + } else { + return std::make_shared(height, width); + } +} + +void CpuMatrix::resize(size_t newHeight, size_t newWidth) { + size_t newSize = newHeight * newWidth; + if (NULL == memoryHandle_.get() || + newSize * sizeof(real) > memoryHandle_->getAllocSize()) { + memoryHandle_ = std::make_shared(newSize * sizeof(real)); + data_ = reinterpret_cast(memoryHandle_->getBuf()); + } + + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newSize; + stride_ = width_; +} + +real CpuMatrix::getElement(size_t x, size_t y) const { + return data_[x * stride_ + y]; +} + +real CpuMatrix::getSum() { + CHECK(isContiguous()); + double sum = 0; + for (size_t i = 0; i < height_; ++i) { + for (size_t j = 0; j < width_; ++j) { + sum += data_[i * width_ + j]; + } + } + return sum; +} + +void CpuMatrix::accumulateColSum(Matrix& src) { + CHECK_EQ(getWidth(), src.getWidth()); + CHECK_EQ(getHeight(), (size_t)1); + + sumCols(src, /* scaleSum= */ 1, /* scaleDest= */ 1); +} + +real CpuMatrix::getAbsSum() { + CHECK(isContiguous()); + double sum = 0; + for (size_t i = 0; i < height_; ++i) { + for (size_t j = 0; j < width_; ++j) { + sum += fabs(data_[i * width_ + j]); + } + } + return sum; +} + +MatrixPtr CpuMatrix::getTranspose() { + if (memoryHandle_.get() != NULL) { + return std::make_shared( + std::dynamic_pointer_cast(memoryHandle_), + height_, + width_, + true); + } else { + MatrixPtr copy_T(new CpuMatrix(data_, height_, width_, true)); + return copy_T; + } +} + +void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { + if (memAlloc) { + matTrans = std::make_shared(width_, height_); + } else { + CHECK(matTrans != NULL); + CHECK_EQ(matTrans->getHeight(), width_); + CHECK_EQ(matTrans->getWidth(), height_); + } + real* dataTrans = matTrans->getData(); + real* data = getData(); + int lda = getStride(); + int ldc = matTrans->getStride(); + + for 
(size_t i = 0; i < height_; i++) { + for (size_t j = 0; j < width_; j++) { + dataTrans[j * ldc + i] = data[i * lda + j]; + } + } +} + +void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { + if (memAlloc) { + matRot = std::make_shared(width_, height_); + } else { + CHECK(matRot != NULL); + CHECK_EQ(matRot->getHeight(), width_); + CHECK_EQ(matRot->getWidth(), height_); + } + real* dataRot = matRot->getData(); + real* data = getData(); + + for (size_t i = 0; i < height_; i++) { + for (size_t j = 0; j < width_; j++) { + if (clockWise) { + dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j]; + } else { + dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)]; + } + } + } +} + +MatrixPtr CpuMatrix::getInverse() { + MatrixPtr matInv; + inverse(matInv, true); + return matInv; +} + +void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { + CHECK_EQ(height_, width_); + + if (memAlloc) { + matInv = std::make_shared(height_, width_); + } else { + CHECK(matInv != NULL); + } + + CHECK_EQ(height_, matInv->getHeight()); + CHECK_EQ(width_, matInv->getWidth()); + matInv->copyFrom(*this); + + real* data = getData(); + real* dataInv = matInv->getData(); + int ldc = matInv->getStride(); + + if (height_ == 1) { + CHECK_NE(*data, 0); + *dataInv = 1.0 / (*data); + return; + } + + /* Compute the LU decomposition of the matrix */ + std::vector ipiv(height_); + CBLAS_ORDER order = (matInv->isTransposed() ? CblasColMajor : CblasRowMajor); + int info = getrf(order, height_, height_, dataInv, ldc, ipiv.data()); + CHECK_EQ(info, 0); + + /* Compute the inverse of the matrix given its LU decompsotion */ + info = getri(order, height_, dataInv, ldc, ipiv.data()); + CHECK_EQ(info, 0); +} + +void CpuMatrix::upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + real* inputData = input.getData(); + real* maskData = mask.getData(); + real* outData = data_; + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + size_t batch = input.getHeight(); + CHECK(inLength == input.getWidth() / channels); + CHECK_EQ(batch, this->getHeight()); + CHECK_EQ(channels * outLength, this->getWidth()); + + for (size_t k = 0; k < batch; k++) { + for (size_t c = 0; c < channels; c++) { + for (size_t i = 0; i < inLength; i++) { + size_t out_index = static_cast(maskData[i]); + if (out_index >= outLength) { + LOG(FATAL) << "upsample index " << out_index << " out of range."; + } + outData[out_index] = inputData[i]; + } + inputData += inLength; + maskData += inLength; + outData += outLength; + } + } +} + +void CpuMatrix::upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + real* outputGradData = outputGrad.getData(); + real* maskData = mask.getData(); + real* inputGradData = data_; + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + size_t batch = outputGrad.getHeight(); + CHECK(inLength == this->getWidth() / channels); + CHECK_EQ(batch, this->getHeight()); + CHECK_EQ(channels * outLength, outputGrad.getWidth()); + + for (size_t k = 0; k < batch; k++) { + for (size_t c = 0; c < channels; c++) { + for (size_t i = 0; i < inLength; i++) { + size_t out_index = static_cast(maskData[i]); + if (out_index >= outLength) { + LOG(FATAL) << "upsample index " << out_index << " out of range."; + } + inputGradData[i] = outputGradData[out_index]; + } + inputGradData += inLength; + 
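+      // Keep the mask and output-gradient cursors advancing in step with the
+      // input gradient so the next channel slice is addressed.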
maskData += inLength; + outputGradData += outLength; + } + } +} + +void CpuMatrix::maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + MatrixPtr maskMatP) { + real* inputData = inputMat.getData(); + real* outData = data_; + real* maskData = NULL; + size_t num = inputMat.getHeight(); + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + CHECK(inLength == inputMat.getWidth() / channels); + CHECK_EQ(num, this->getHeight()); + CHECK_EQ(channels * outLength, this->getWidth()); + size_t outStride = getStride(); + + if (maskMatP != NULL) { + maskData = maskMatP->getData(); + CHECK_EQ(channels * outLength, maskMatP->getWidth()); + } + + /* pool max one by one */ + for (size_t n = 0; n < num; ++n) { // frame by frame + if (!isContiguous()) { + outData = data_ + n * outStride; + } + for (size_t c = 0; c < channels; ++c) { // channel by channel + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = hstart + sizeY; + hstart = hstart < 0 ? 0 : hstart; + hend = hend < (int)imgSizeH ? hend : (int)imgSizeH; + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = wstart + sizeX; + wstart = wstart < 0 ? 0 : wstart; + wend = wend < (int)imgSizeW ? wend : (int)imgSizeW; + + real maxval = -(real)FLT_MAX; + int max_index = -1; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (maxval < inputData[h * imgSizeW + w]) { + maxval = inputData[h * imgSizeW + w]; + max_index = h * imgSizeW + w; + } + } + } + + outData[ph * outputW + pw] = maxval; + if (maskData != NULL) maskData[ph * outputW + pw] = max_index; + } + } + // compute offset + inputData += inLength; + outData += outLength; + + if (maskData != NULL) maskData += outLength; + } + } +} + +void CpuMatrix::maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { + size_t num = image.getHeight(); + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + size_t channels = size_t(width_ / inLength); + CHECK(image.getWidth() == inLength * channels); + CHECK(image.getHeight() == height_ && image.getWidth() == width_); + CHECK(outV.getHeight() == outGrad.getHeight() && + outV.getWidth() == outGrad.getWidth()); + + real* tgtGrad = data_; + real* inData = image.getData(); + real* otData = outV.getData(); + real* otGrad = outGrad.getData(); + + size_t outStride = outV.getStride(); + real* origOutData = otData; + real* origOutGrad = otGrad; + + for (size_t n = 0; n < num; ++n) { + if (!outV.isContiguous()) { + otData = origOutData + n * outStride; + otGrad = origOutGrad + n * outStride; + } + for (size_t c = 0; c < channels; ++c) { + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + tgtGrad[h * imgSizeW + w] = + scaleTargets * tgtGrad[h * 
imgSizeW + w] + + scaleOutput * otGrad[ph * outputW + pw] * + (inData[h * imgSizeW + w] == otData[ph * outputW + pw]); + } + } + } + } + // offset + inData += inLength; + tgtGrad += inLength; + otData += outLength; + otGrad += outLength; + } + } +} + +void CpuMatrix::avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + bool excludeMode) { + // The main loop + size_t num = input.getHeight(); + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + CHECK(inLength * channels == input.getWidth()); + CHECK(outLength * channels * num == height_ * width_); + real* tgtData = data_; + real* inData = input.getData(); + + for (size_t n = 0; n < num; ++n) { + if (!isContiguous()) { + tgtData = data_ + n * getStride(); + } + for (size_t c = 0; c < channels; ++c) { + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + tgtData[ph * outputW + pw] = 0; // clear + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + tgtData[ph * outputW + pw] += inData[h * imgSizeW + w]; + } + } + int poolSize = + excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX; + CHECK(poolSize); + tgtData[ph * outputW + pw] /= poolSize; + } + } + // compute offset + inData += inLength; + tgtData += outLength; + } + } +} + +void CpuMatrix::avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW, + bool excludeMode) { + size_t num = input.getHeight(); + size_t channels = input.getWidth() / outputH / outputW; + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + CHECK(inLength * channels == getWidth()); + real* inData = input.getData(); + real* outData = getData(); + + for (size_t n = 0; n < num; ++n) { + if (!input.isContiguous()) { + inData = input.getData() + n * input.getStride(); + } + for (size_t c = 0; c < channels; ++c) { + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + int poolSize = + excludeMode ? 
(hend - hstart) * (wend - wstart) : sizeY * sizeX; + CHECK(poolSize); + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + outData[h * imgSizeW + w] += inData[ph * outputW + pw] / poolSize; + } + } + } + } + // offset + outData += inLength; + inData += outLength; + } + } +} + +void CpuMatrix::maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + real* inputData = inputMat.getData(); + real* outData = getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); + size_t num = inputMat.getHeight(); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + CHECK(inLength == inputMat.getWidth() / channels); + CHECK_EQ(num, this->getHeight()); + CHECK_EQ(channels * outLength, this->getWidth()); + size_t outStride = getStride(); + + /* initialize the data_ */ + for (size_t i = 0; i < height_; i++) { + for (size_t j = 0; j < width_; j++) { + outData[(i)*outStride + j] = -(real)FLT_MAX; + maxPoolIdxData[(i)*outStride + j] = -1; + } + } + + /* pool max one by one */ + for (size_t n = 0; n < num; ++n) { // frame by frame + if (!isContiguous()) { + outData = getData() + n * outStride; + maxPoolIdxData = maxPoolIdx.getData() + n * outStride; + } + for (size_t c = 0; c < channels; ++c) { // channel by channel + for (size_t pd = 0; pd < outputD; ++pd) { + int dstart = pd * strideD - paddingD; + int dend = std::min(dstart + sizeZ, imgSizeD); + dstart = std::max(dstart, 0); + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + int maxIdx = -1; + real maxOutData = outData[(pd * outputH + ph) * outputW + pw]; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (maxOutData < + inputData[(d * imgSizeH + h) * imgSizeW + w]) { + maxOutData = inputData[(d * imgSizeH + h) * imgSizeW + w]; + maxIdx = (d * imgSizeH + h) * imgSizeW + w; + } + } + } + } + outData[(pd * outputH + ph) * outputW + pw] = maxOutData; + maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx; + } + } + } + // compute offset + inputData += inLength; + outData += outLength; + maxPoolIdxData += outLength; + } + } +} + +void CpuMatrix::maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + size_t num = getHeight(); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + size_t channels = size_t(width_ / inLength); + CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() && + maxPoolIdx.getWidth() == outGrad.getWidth()); + + real* tgtGrad = getData(); + real* otGrad = outGrad.getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); + size_t outStride = outGrad.getStride(); + + 
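+  // The backward pass scatters each output gradient to the single input
+  // element that won the max in the forward pass, using the index map that
+  // was recorded in maxPoolIdx.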
for (size_t n = 0; n < num; ++n) { + if (!outGrad.isContiguous()) { + otGrad = outGrad.getData() + n * outStride; + maxPoolIdxData = maxPoolIdx.getData() + n * outStride; + } + for (size_t c = 0; c < channels; ++c) { + for (size_t pd = 0; pd < outputD; ++pd) { + for (size_t ph = 0; ph < outputH; ++ph) { + for (size_t pw = 0; pw < outputW; ++pw) { + const size_t index = (pd * outputH + ph) * outputW + pw; + const size_t tgtIdx = static_cast<size_t>(maxPoolIdxData[index]); + tgtGrad[tgtIdx] = + scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index]; + } + } + } + // offset + tgtGrad += inLength; + otGrad += outLength; + maxPoolIdxData += outLength; + } + } +} + +void CpuMatrix::avgPool3DForward(Matrix& input, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + // The main loop + size_t num = input.getHeight(); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + CHECK(inLength * channels == input.getWidth()); + CHECK(outLength * channels * num == height_ * width_); + real* tgtData = getData(); + real* inData = input.getData(); + + for (size_t n = 0; n < num; ++n) { + if (!isContiguous()) { + tgtData = data_ + n * getStride(); + } + for (size_t c = 0; c < channels; ++c) { + for (size_t pd = 0; pd < outputD; ++pd) { + int dstart = pd * strideD - paddingD; + int dend = std::min(dstart + sizeZ, imgSizeD); + dstart = std::max(dstart, 0); + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + + tgtData[(pd * outputH + ph) * outputW + pw] = 0; // clear + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + tgtData[(pd * outputH + ph) * outputW + pw] += + inData[(d * imgSizeH + h) * imgSizeW + w]; + } + } + } + int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); + CHECK(poolSize); + tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize; + } + } + } + // compute offset + inData += inLength; + tgtData += outLength; + } + } +} + +void CpuMatrix::avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + size_t num = input.getHeight(); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + size_t channels = input.getWidth() / outLength; + CHECK(inLength * channels == getWidth()); + real* inData = input.getData(); + real* outData = getData(); + + for (size_t n = 0; n < num; ++n) { + if (!input.isContiguous()) { + inData = input.getData() + n * input.getStride(); + } + for (size_t c = 0; c < channels; ++c) { + for (size_t pd = 0; pd < outputD; ++pd) { + int dstart = pd * strideD - paddingD; + int dend = std::min(dstart + sizeZ, imgSizeD); + dstart = std::max(dstart, 0);
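+ // Each pooling window is clipped to the volume borders; the gradient is + // divided by the clipped window size below, matching the forward divisor.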
+ for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); + CHECK(poolSize); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + outData[(d * imgSizeH + h) * imgSizeW + w] += + inData[(pd * outputH + ph) * outputW + pw] / poolSize; + } + } + } + } + } + } + // offset + outData += inLength; + inData += outLength; + } + } +} + +/** + * Input: one or more sequences. Each sequence contains some instances. + * Output: output size is the number of input sequences (NOT input instances). + * output[i] is set to max_{for each instance in this sequence}{input[i]} + */ +void CpuMatrix::maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index) { + CHECK(dynamic_cast<CpuMatrix*>(&input)); + CHECK(dynamic_cast<const CpuIVector*>(&sequence)); + CHECK(dynamic_cast<CpuIVector*>(&index)); + + real* outData = getData(); + real* inputData = input.getData(); + const int* starts = sequence.getData(); + int* maxIndex = index.getData(); + size_t numSequences = getHeight(); + size_t dim = getWidth(); + + CHECK_EQ(dim, input.getWidth()); + CHECK_EQ(numSequences, sequence.getSize() - 1); + CHECK_EQ(starts[numSequences], (int)input.getHeight()); + CHECK_EQ(numSequences * dim, index.getSize()); + + for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { + // current sequence, loop for each input instance + // (1) first instance: do not need compare, copy value to outV directly + for (size_t k = 0; k < dim; ++k) { + outData[sequenceId * dim + k] = inputData[starts[sequenceId] * dim + k]; + maxIndex[sequenceId * dim + k] = starts[sequenceId]; + } + // (2) other instance in same sequence + for (int insId = starts[sequenceId] + 1; insId < starts[sequenceId + 1]; + ++insId) { + // insId is the index on all instances + for (size_t k = 0; k < dim; ++k) { + // for each dim + if (inputData[insId * dim + k] > outData[sequenceId * dim + k]) { + // update max value and record index + outData[sequenceId * dim + k] = inputData[insId * dim + k]; + maxIndex[sequenceId * dim + k] = insId; + } + } + } + } +} + +void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index) { + CHECK(dynamic_cast<CpuMatrix*>(&outputGrad)); + CHECK(dynamic_cast<const CpuIVector*>(&sequence)); + CHECK(dynamic_cast<CpuIVector*>(&index)); + + real* inputGrad = getData(); + real* outGrad = outputGrad.getData(); + int* maxIndex = index.getData(); + size_t dim = getWidth(); + size_t numSequences = sequence.getSize() - 1; + + CHECK_EQ(dim, outputGrad.getWidth()); + CHECK_EQ(numSequences, outputGrad.getHeight()); + CHECK_EQ(numSequences * dim, index.getSize()); + + for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { + // current sequence + for (size_t j = 0; j < dim; ++j) { + // each dim + int insId = maxIndex[sequenceId * dim + j]; + inputGrad[insId * dim + j] += outGrad[sequenceId * dim + j]; + } + } +} + +inline void vecAddTo(real* a, const real* b, size_t len) { + for (unsigned int i = 0; i < len; ++i) { + a[i] += b[i]; + } +} + +inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { + for (unsigned int i = 0; i < len; ++i) { + a[i] += scaleB * b[i]; + } +} + +inline void colVecAddTo( + real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) { + for (unsigned int i = 0; i < len; ++i) { 
+ a[i * aWidth] += b[i * bWidth]; + } +} + +inline void colVecAddTo( + real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) { + for (unsigned int i = 0; i < len; ++i) { + a[i * aWidth] += b[i * bWidth] * c; + } +} + +void CpuMatrix::addBias(Matrix& b, real scale) { + CHECK(b.useGpu_ == false) << "Matrix types are not equal"; + + CHECK_EQ(b.getHeight(), (size_t)1); + CHECK_EQ(width_, b.getWidth()); + real* aData = getData(); + real* bData = b.getData(); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + + if (scale == 1 && getStride() % 32 == 0) { // use libaddto + // @TODO(yuyang18) Allow the input address to be unaligned, + // then merge this if and else + CHECK_EQ((size_t)aData % 32, 0UL); + CHECK_EQ((size_t)bData % 32, 0UL); + for (size_t i = 0; i < numSamples; i++) { + simd::addTo(aData + i * getStride(), bData, dim); + } + } else { + for (size_t i = 0; i < numSamples; i++) { + for (size_t j = 0; j < dim; j++) { + aData[i * getStride() + j] += scale * bData[j]; + } + } + } +} + +void CpuMatrix::addSharedBias(Matrix& b, real scale) { + CHECK_EQ(b.getHeight(), (size_t)1); + real* aData = getData(); + real* bData = b.getData(); + size_t numSamples = getHeight(); + size_t channel = b.getWidth(); + CHECK_EQ(getWidth() % channel, 0UL); + size_t dim = getWidth() / channel; + + for (size_t i = 0; i < numSamples; i++) { + for (size_t c = 0; c < channel; c++) { + for (size_t j = 0; j < dim; j++) { + aData[i * getStride() + c * dim + j] += scale * bData[c]; + } + } + } +} + +void CpuMatrix::collectBias(Matrix& a, real scale) { + CHECK_EQ(getHeight(), (size_t)1); + CHECK_EQ(width_, a.getWidth()); + CpuSparseMatrix* aptr = dynamic_cast<CpuSparseMatrix*>(&a); + if (!aptr) { + sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); + } else { + size_t nnz = aptr->getElementCnt(); + int* cols = aptr->getCols(); + real* A = aptr->getValue(); + real* B = getData(); + for (size_t i = 0; i < nnz; i++) { + B[cols[i]] += scale * A[i]; + } + } +} + +void CpuMatrix::collectSharedBias(Matrix& a, real scale) { + CHECK_EQ(getHeight(), (size_t)1); + real* B = getData(); + real* A = a.getData(); + size_t numSamples = a.getHeight(); + size_t channel = getWidth(); + CHECK_EQ(a.getWidth() % channel, 0UL); + size_t dim = a.getWidth() / channel; + for (size_t i = 0; i < numSamples; i++) { + for (size_t c = 0; c < channel; c++) { + for (size_t j = 0; j < dim; j++) { + B[c] += scale * A[i * channel * dim + c * dim + j]; + } + } + } +} + +void CpuMatrix::sequenceAvgForward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); + MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); + for (size_t i = 0; i < height; i++) { + int sequenceLength = starts[i + 1] - starts[i]; + if (0 == sequenceLength) { + // empty sequence + continue; + } + outMtx->setData(dst + i * width); + dataMtx->setData(src + starts[i] * width, sequenceLength, width);
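+ // mode 0: average over the sequence, mode 1: plain sum, + // mode 2: sum scaled by 1 / sqrt(sequenceLength).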
+ if (mode == 0) { + // plain average + outMtx->sumCols(*dataMtx, + (real)1 / (real)sequenceLength, + /* scaleDest= */ 1); + } else if (mode == 1) { + // sum instead of average + outMtx->sumCols(*dataMtx, /* scaleSum= */ 1, /* scaleDest= */ 1); + } else if (mode == 2) { + // divide by square root of sequenceLength + outMtx->sumCols(*dataMtx, + (real)1 / std::sqrt(sequenceLength), + /* scaleDest= */ 1); + } else { + LOG(FATAL) << "should not reach here"; + } + } +} + +void CpuMatrix::sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = a.getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); + MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); + for (size_t i = 0; i < height; ++i) { + int sequenceLength = starts[i + 1] - starts[i]; + if (0 == sequenceLength) { + // empty sequence + continue; + } + outMtx->setData(dst + starts[i] * width, sequenceLength, width); + dataMtx->setData(src + i * width); + if (mode == 0) { + // plain average + outMtx->addBias(*dataMtx, 1.0f / sequenceLength); + } else if (mode == 1) { + // sum instead of average + outMtx->addBias(*dataMtx, 1.0f); + } else if (mode == 2) { + // divide by square root of sequenceLength + outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength)); + } else { + LOG(FATAL) << "should not reach here"; + } + } +} + +/* this = scaleAB*(a*b) + scaleT*this */ +void CpuMatrix::mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { + CHECK(!isTransposed()) << "Not supported"; + const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a); + const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b); + const auto a_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&a); + const auto b_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&b); + + if (a_ptr && b_ptr) { + mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, scaleAB, scaleT); + } else if (a_ptr_s && b_ptr) { + mul((CpuSparseMatrix*)a_ptr_s, (CpuMatrix*)b_ptr, scaleAB, scaleT); + } else if (a_ptr && b_ptr_s) { + mul((CpuMatrix*)a_ptr, (CpuSparseMatrix*)b_ptr_s, scaleAB, scaleT); + } else { + LOG(FATAL) << "Not supported"; + } +} + +void CpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, + real scaleT) { + if (dynamic_cast<CacheRowCpuMatrix*>(b)) { + return mul(a, dynamic_cast<CacheRowCpuMatrix*>(b), this, scaleAB, scaleT); + } else if (dynamic_cast<SparseRowCpuMatrix*>(b)) { + return mul(a, dynamic_cast<SparseRowCpuMatrix*>(b), this, scaleAB, scaleT); + } else { + return mul(a, b, this, scaleAB, scaleT); + } +} + +void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { + CHECK(!isTransposed()) << "Not supported"; + + size_t a_col, b_col, a_row, b_row; + bool a_trans, b_trans; + if (!a->isTransposed()) { + a_col = a->getWidth(); + a_row = a->getHeight(); + a_trans = false; + } else { + a_col = a->getHeight(); + a_row = a->getWidth(); + a_trans = true; + } + if (!b->isTransposed()) { + b_col = b->getWidth(); + b_row = b->getHeight(); + b_trans = false; + } else { + b_col = b->getHeight(); + b_row = b->getWidth(); + b_trans = true; + } + + CHECK_EQ(a_col, b_row); + CHECK_EQ(a_row, getHeight()); + CHECK_EQ(b_col, getWidth()); + + real* A = a->getData(); + real* B = b->getData(); + real* C = getData(); + + int M = getHeight(); + int N = getWidth(); + int K = a_col; + int lda = a->getStride(); + int ldb = b->getStride(); + int ldc = getStride(); + BlasGemm<real>::compute( + a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); +} + +void CpuMatrix::mul( + CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, real scaleT) { + CHECK(!c->isTransposed()) << "Not supported"; + CHECK_EQ(c->getValueType(), FLOAT_VALUE); + + real* A = a->getData(); + real* B = b->getData(); + real* C = c->getValue(); + int* rows = c->getRows(); + int* cols = c->getCols(); + size_t height = c->getHeight(); + size_t width = c->getWidth();
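+ // c is sparse: only the positions already present in c's CSR/CSC pattern + // are computed, each as the dot product of the matching row of a and + // column of b.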
+ if (scaleT == 0) { + c->zeroMem(); + } + + if (!a->isTransposed() && !b->isTransposed()) { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getHeight(), height); + CHECK_EQ(b->getWidth(), width); + if (c->getFormat() == SPARSE_CSC) { + for (size_t i = 0; i < width; i++) { + size_t start = c->getColStartIdx(i); + size_t end = c->getColStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t rowIdx = rows[j]; + for (size_t k = 0; k < m; k++) { + sum += A[rowIdx * m + k] * B[k * width + i]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + } else { + for (size_t i = 0; i < height; i++) { + size_t start = c->getRowStartIdx(i); + size_t end = c->getRowStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t colIdx = cols[j]; + for (size_t k = 0; k < m; k++) { + sum += A[i * m + k] * B[k * width + colIdx]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + } + } else if (a->isTransposed() && !b->isTransposed()) { + size_t m = a->getHeight(); + CHECK_EQ(m, b->getHeight()); + CHECK_EQ(b->getWidth(), width); + CHECK_EQ(a->getWidth(), height); + + if (c->getFormat() == SPARSE_CSC) { + for (size_t i = 0; i < width; i++) { + size_t start = c->getColStartIdx(i); + size_t end = c->getColStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t rowIdx = rows[j]; + for (size_t k = 0; k < m; k++) { + sum += A[k * height + rowIdx] * B[k * width + i]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + } else { + for (size_t i = 0; i < height; i++) { + int start = c->getRowStartIdx(i); + int end = c->getRowStartIdx(i + 1); + for (int j = start; j < end; j++) { + real sum = 0; + size_t colIdx = cols[j]; + for (size_t k = 0; k < m; k++) { + sum += A[k * height + i] * B[k * width + colIdx]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + } + } else if (!a->isTransposed() && b->isTransposed()) { + size_t m = a->getWidth(); + CHECK_EQ(b->getWidth(), m); + CHECK_EQ(a->getHeight(), height); + CHECK_EQ(b->getHeight(), width); + if (c->getFormat() == SPARSE_CSR) { + for (size_t i = 0; i < height; i++) { + size_t start = c->getRowStartIdx(i); + size_t end = c->getRowStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t colIdx = cols[j]; + for (size_t k = 0; k < m; k++) { + sum += A[i * m + k] * B[colIdx * m + k]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + } else { + LOG(FATAL) << "Not supported csc format " + "when a is not trans and b is trans"; + } + } else { + LOG(FATAL) << "Not supported"; + } +} + +void CpuMatrix::mul(CpuMatrix* a, + CpuSparseMatrix* b, + real scaleAB, + real scaleT) { + CHECK(!trans_) << "Not supported"; + CHECK(!a->isTransposed()) << "Not supported"; + CHECK(scaleT == 0 || scaleT == 1); + + // TODO(yuyang18): Maybe bug implementation here + CHECK_EQ(scaleAB, static_cast<real>(1.0)); + + real* A = a->getData(); + real* B = b->getValue(); + real* C = getData(); + int* rows = b->getRows(); + int* cols = b->getCols(); + + if (scaleT == 0) { + zeroMem(); + } + if (b->getFormat() == SPARSE_CSC) { + if (!b->isTransposed()) { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getHeight(), height_); + CHECK_EQ(b->getWidth(), width_); + + if (b->getValueType() == NO_VALUE) { + for (size_t j = 0; j < b->getWidth(); ++j) { + int start = b->getColStartIdx(j); + int end = b->getColStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo(C + j, A + rows[i], height_, width_, a->getWidth()); + } + } + } else if (b->getValueType() == FLOAT_VALUE) {
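+ // A NO_VALUE matrix stores structure only, so each stored element above + // acts as an implicit 1.0; this branch performs the same accumulation + // weighted by the stored value B[i].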
+ for (size_t j = 0; j < b->getWidth(); ++j) { + int start = b->getColStartIdx(j); + int end = b->getColStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo( + C + j, A + rows[i], B[i], height_, width_, a->getWidth()); + } + } + } + } else /*if (b->isTransposed())*/ { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), width_); + CHECK_EQ(a->getHeight(), height_); + CHECK_EQ(b->getWidth(), m); + if (b->getValueType() == NO_VALUE) { + for (size_t i = 0; i < b->getWidth(); ++i) { + int start = b->getColStartIdx(i); + int end = b->getColStartIdx(i + 1); + for (int j = start; j < end; ++j) { + colVecAddTo(C + rows[j], A + i, height_, width_, a->getWidth()); + } + } + } else if (b->getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < b->getWidth(); ++i) { + int start = b->getColStartIdx(i); + int end = b->getColStartIdx(i + 1); + for (int j = start; j < end; ++j) { + colVecAddTo( + C + rows[j], A + i, B[j], height_, width_, a->getWidth()); + } + } + } + } + } else { + if (!b->isTransposed()) { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getHeight(), height_); + CHECK_EQ(b->getWidth(), width_); + + if (b->getValueType() == NO_VALUE) { + for (size_t j = 0; j < b->getHeight(); ++j) { + int start = b->getRowStartIdx(j); + int end = b->getRowStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo(C + cols[i], A + j, height_, width_, a->getWidth()); + } + } + } else if (b->getValueType() == FLOAT_VALUE) { + for (size_t j = 0; j < b->getHeight(); ++j) { + int start = b->getRowStartIdx(j); + int end = b->getRowStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo( + C + cols[i], A + j, B[i], height_, width_, a->getWidth()); + } + } + } + } else /*if (b->isTransposed())*/ { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), width_); + CHECK_EQ(a->getHeight(), height_); + CHECK_EQ(b->getWidth(), m); + if (b->getValueType() == NO_VALUE) { + for (size_t i = 0; i < b->getHeight(); ++i) { + int start = b->getRowStartIdx(i); + int end = b->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + colVecAddTo(C + i, A + cols[j], height_, width_, a->getWidth()); + } + } + } else if (b->getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < b->getHeight(); ++i) { + int start = b->getRowStartIdx(i); + int end = b->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + colVecAddTo( + C + i, A + cols[j], B[j], height_, width_, a->getWidth()); + } + } + } + } + } +} + +void CpuMatrix::selectRows(Matrix& table, IVector& ids) { + if (dynamic_cast<CacheRowCpuMatrix*>(&table)) { + selectRowsImp(*dynamic_cast<CacheRowCpuMatrix*>(&table), ids); + } else if (dynamic_cast<SparseRowCpuMatrix*>(&table)) { + selectRowsImp(*dynamic_cast<SparseRowCpuMatrix*>(&table), ids); + } else { + CHECK(table.isContiguous()); + selectRowsImp(*dynamic_cast<CpuMatrix*>(&table), ids); + } +} + +void CpuMatrix::selectElements(Matrix& table, IVector& ids) { + CHECK_EQ(table.getHeight(), ids.getSize()); + CHECK_EQ(getHeight(), ids.getSize()); + CHECK_EQ(getWidth(), 1U); + real* tableData = table.getData(); + int* idsData = ids.getData(); + for (size_t i = 0; i < table.getHeight(); i++) { + data_[i] += tableData[i * table.getWidth() + idsData[i]]; + } +} + +void CpuMatrix::addElements(Matrix& table, IVector& ids) { + CHECK_EQ(table.getHeight(), ids.getSize()); + CHECK_EQ(getHeight(), ids.getSize()); + CHECK_EQ(getWidth(), 1U); + real* tableData = table.getData(); + int* idsData = ids.getData(); + for (size_t i = 0; i < table.getHeight(); i++) { + tableData[i * table.getWidth() + idsData[i]] += data_[i]; + } +}
+ +// this.row[i] += table.row[ids[i]] +template <typename TableMatType> +void CpuMatrix::selectRowsImp(TableMatType& table, IVector& ids) { + CHECK(!table.useGpu()); + CHECK(!ids.useGpu()); + CHECK_EQ(getHeight(), ids.getSize()); + CHECK_EQ(getWidth(), table.getWidth()); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + real* a = getData(); + size_t tableSize = table.getHeight(); + int* index = ids.getData(); + + for (size_t i = 0; i < numSamples; ++i) { + if (index[i] == -1) continue; + CHECK_LT(index[i], (int)tableSize); + CHECK_GE(index[i], 0); + vecAddTo(a + i * stride_, table.getRow(index[i]), dim); + } +} + +void CpuMatrix::addToRows(Matrix& table, IVector& ids) { + if (dynamic_cast<CacheRowCpuMatrix*>(&table)) { + addToRowsImp(*dynamic_cast<CacheRowCpuMatrix*>(&table), ids); + } else if (dynamic_cast<SparseAutoGrowRowCpuMatrix*>(&table)) { + addToRowsImp(*dynamic_cast<SparseAutoGrowRowCpuMatrix*>(&table), ids); + } else if (dynamic_cast<SparseRowCpuMatrix*>(&table)) { + addToRowsImp(*dynamic_cast<SparseRowCpuMatrix*>(&table), ids); + } else { + CHECK(table.isContiguous()); + addToRowsImp(*dynamic_cast<CpuMatrix*>(&table), ids); + } +} + +// table.row[ids[i]] += this.row[i] +template <typename TableMatType> +void CpuMatrix::addToRowsImp(TableMatType& table, IVector& ids) { + CHECK(!table.useGpu()); + CHECK(!ids.useGpu()); + CHECK_EQ(getHeight(), ids.getSize()); + CHECK_EQ(getWidth(), table.getWidth()); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + real* a = getData(); + size_t tableSize = table.getHeight(); + int* index = ids.getData(); + + for (size_t i = 0; i < numSamples; ++i) { + if (index[i] == -1) continue; + CHECK_LT(index[i], (int)tableSize); + CHECK_GE(index[i], 0); + vecAddTo(table.getRow(index[i]), a + i * stride_, dim); + } +} + +static ThreadLocal<std::vector<const real*>> threadLocalColArray; + +template <typename MatBType, typename MatCType> +void CpuMatrix::mul( + CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT) { + CHECK(!c->isTransposed()) << "Not supported"; + CHECK(!b->isTransposed()) << "Not supported"; + // TODO(yuyang18): Maybe bug implementation here. + CHECK(scaleAB == 1) << "Not supported"; + CHECK(scaleT == 0 || scaleT == 1) << "Not supported"; + CHECK_EQ(a->getFormat(), SPARSE_CSR) << "Not supported"; + + real* B = b->getData(); + real* C = c->getData(); + size_t height = c->getHeight(); + size_t width = c->getWidth(); + int* cols = a->getCols(); + real* values = a->getValue(); + + if (scaleT == 0) { + c->zeroMem(); + } + + if (!a->isTransposed()) { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getHeight(), height); + CHECK_EQ(b->getWidth(), width); + + if (a->getValueType() == NO_VALUE) { + if (width % 32 == 0) { // use libaddto
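+ // Fast path: collect the row pointers selected by this CSR row into + // colArray, then add them all to c's row with one simd::batchAddTo + // call; both operands must be 32-byte aligned.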
+ // @TODO(yuyang18) Allow the input address to be unaligned, + // then merge this if and else + CHECK_EQ((size_t)B % 32, 0UL); + CHECK_EQ((size_t)C % 32, 0UL); + auto& colArray = *threadLocalColArray; + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + size_t colNum = end - start; + colArray.resize(colNum); + for (int j = 0; j < end - start; ++j) { + colArray[j] = b->getRow(cols[j + start]); + } + simd::batchAddTo(c->getRow(i), &colArray[0], colNum, width); + } + + } else { + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + vecAddTo(c->getRow(i), b->getRow(cols[j]), width); + } + } + } + } else if (a->getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + vecAddTo(c->getRow(i), b->getRow(cols[j]), values[j], width); + } + } + } + } else /*if (a->isTransposed())*/ { + size_t m = a->getHeight(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getWidth(), height); + CHECK_EQ(b->getWidth(), width); + if (a->getValueType() == NO_VALUE) { + if (width % 32 == 0) { // use libaddto + // @TODO(yuyang18) Allow the input address to be unaligned, + // then merge this if and else + CHECK_EQ((size_t)B % 32, 0UL); + CHECK_EQ((size_t)C % 32, 0UL); + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + simd::addTo(c->getRow(cols[j]), b->getRow(i), width); + } + } + + } else { + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + vecAddTo(c->getRow(cols[j]), b->getRow(i), width); + } + } + } + } else if (a->getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + vecAddTo(c->getRow(cols[j]), b->getRow(i), values[j], width); + } + } + } + } +} + +// instantiation mul() called in SparseRowMatrix.cpp +template void CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>( + CpuSparseMatrix* a, + CpuMatrix* b, + SparseRowCpuMatrix* c, + real scaleAB, + real scaleT); +template void CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>( + CpuSparseMatrix* a, + CpuMatrix* b, + SparseAutoGrowRowCpuMatrix* c, + real scaleAB, + real scaleT); +template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a, + CpuMatrix* b, + CacheRowCpuMatrix* c, + real scaleAB, + real scaleT); + +#ifndef PADDLE_MOBILE_INFERENCE +void SharedCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, + real scaleT) { + CHECK(!isTransposed()) << "Not supported"; + CHECK(!b->isTransposed()) << "Not supported"; + CHECK_EQ(scaleAB, 1) << "Not supported"; + CHECK_EQ(scaleT, 1) << "Not supported"; + CHECK_EQ(a->getFormat(), SPARSE_CSR) << "not supported"; + + real* B = b->getData(); + real* C = getData(); + size_t height = getHeight(); + size_t width = getWidth(); + + // get real trans + MatrixPtr aTrans; + if (a->isTransposed()) { + aTrans = a->getTmpSparseMatrix(a->getWidth(), a->getHeight()); + a->transpose(aTrans, false); + } + a = dynamic_cast<CpuSparseMatrix*>(aTrans.get()); + + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getHeight(), height); + CHECK_EQ(b->getWidth(), width); + + size_t blockSize = (height / blockNum_) + 1;
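+ // Partial sums are accumulated into a thread-local buffer and merged + // into C one block at a time under that block's mutex, in a shuffled + // block order so concurrent callers rarely contend on the same lock.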
+ CpuMatrixPtr localBuf = *localBuf_; + if (!localBuf) { + localBuf = std::make_shared<CpuMatrix>(blockSize, width); + } else { + localBuf->resize(blockSize, width); + } + localBuf->zeroMem(); + real* localC = localBuf->getData(); + std::vector<int>& blockSeq = *blockSeq_; + if (blockSeq.size() == 0) { + for (int k = 0; k < blockNum_; ++k) { + blockSeq.push_back(k); + } + std::shuffle( + blockSeq.begin(), blockSeq.end(), ThreadLocalRandomEngine::get()); + } + std::vector<int>& localBufRows = *localBufRows_; + int* cols = a->getCols(); + real* value = a->getValue(); + + for (int k = 0; k < blockNum_; ++k) { + int blockId = blockSeq[k]; + size_t blockBegin = blockId * blockSize; + size_t blockEnd = (blockId + 1) * blockSize; + if (blockId == blockNum_ - 1) { + blockEnd = height; + } + if (a->getValueType() == NO_VALUE) { + for (size_t i = blockBegin; i < blockEnd; ++i) { + int start = a->getRowStartIdx(i); + int end = a->getRowStartIdx(i + 1); + size_t colNum = a->getColNum(i); + if (colNum == 0) { + continue; + } // skip empty row + localBufRows.push_back(i); + size_t bufPos = localBufRows.size() - 1; + for (int j = start; j < end; ++j) { + vecAddTo(localC + bufPos * width, B + cols[j] * width, width); + } + } + } else if (a->getValueType() == FLOAT_VALUE) { + for (size_t i = blockBegin; i < blockEnd; ++i) { + int start = a->getRowStartIdx(i); + int end = a->getRowStartIdx(i + 1); + size_t colNum = a->getColNum(i); + if (colNum == 0) { + continue; + } // skip empty row + localBufRows.push_back(i); + size_t bufPos = localBufRows.size() - 1; + for (int j = start; j < end; ++j) { + vecAddTo( + localC + bufPos * width, B + cols[j] * width, value[j], width); + } + } + } + + { + std::lock_guard<std::mutex> guard(*blockLocks_[blockId]); + for (size_t i = 0; i < localBufRows.size(); ++i) { + vecAddTo(C + localBufRows[i] * width, localC + i * width, width); + } + } + memset(localC, 0, localBufRows.size() * width * sizeof(real)); + localBufRows.clear(); + } + + VLOG(2) << " B[0]=" << B[0] << " B[1]=" << B[1] << " C[0]=" << C[0] + << " C[1]=" << C[1]; +} + +void SharedCpuMatrix::add(Matrix& b, real p1, real p2) { + CHECK_EQ(blockNum_, 1); + std::lock_guard<std::mutex> guard(*blockLocks_[0]); + CpuMatrix::add(b, p1, p2); +} + +void SharedCpuMatrix::add(real p1, real p2) { + CHECK_EQ(blockNum_, 1); + std::lock_guard<std::mutex> guard(*blockLocks_[0]); + CpuMatrix::add(p1, p2); +} + +void SharedCpuMatrix::initShared(int blockNum) { + CHECK_GT(height_ * width_, 1UL * 1024 * 1024) + << "should not share small matrix"; + initBlock(blockNum); +} + +void SharedCpuMatrix::initBlock(int blockNum) { + CHECK_LE(blockNum, 200) << "should not use large block number"; + blockNum_ = blockNum; + blockLocks_.resize(blockNum); + for (auto& locker : blockLocks_) { + locker.reset(new std::mutex); + } +} + +#endif +/* Add a (column) vector b to matrix a, column by column */ +void CpuMatrix::addColumnVector(const Matrix& b) { + BaseMatrix::addColVector(const_cast<Matrix&>(b)); +} + +/* this = a*b */ +void CpuMatrix::mul(const Matrix& a, const Matrix& b) { + return mul(a, b, 1.0, 0.0); +} + +/* this = scaleAB*(this*b) + scaleT*this */ +void CpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) { + (void)b; + (void)scaleAB; + (void)scaleT; + LOG(FATAL) << "Not implemented"; +} + +/* this = this*b */ +void CpuMatrix::rightMul(Matrix& b) { return rightMul(b, 1.0, 0.0); } + +/* this = scaleAB*(a*this) + scaleT*this */ +void CpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) { + (void)a; + (void)scaleAB; + (void)scaleT; + LOG(FATAL) << "Not implemented"; +} + +/* this = a*this */ +void CpuMatrix::leftMul(Matrix& a) 
{ return leftMul(a, 1.0, 0.0); } + +void CpuMatrix::colMerge(Matrix& src) { src.rowSum(*this); } + +void CpuMatrix::rowSum(Matrix& sum) { + CHECK_EQ(sum.getHeight(), getHeight()); + CHECK_EQ(sum.getWidth(), (size_t)1); + + sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); +} + +void CpuMatrix::rowMaxId(IVector& maxIds) { + CHECK(!maxIds.useGpu()) << "Matrix types are not equal"; + + size_t numSamples = getHeight(); + CHECK_EQ(maxIds.getSize(), numSamples); + + real* a = getData(); + int* s = maxIds.getData(); + size_t dim = getWidth(); + + for (size_t i = 0; i < numSamples; i++) { + real sm = a[i * dim]; + int maxId = 0; + for (size_t j = 1; j < dim; j++) { + if (a[i * dim + j] > sm) { + maxId = j; + sm = a[i * dim + j]; + } + } + s[i] = maxId; + } +} + +void CpuMatrix::rowMax(Matrix& max) { + CHECK_EQ(max.getHeight(), getHeight()); + CHECK_EQ(max.getWidth(), (size_t)1); + max.maxRows(*this); +} + +/* Get the top k elements of each row of this matrix */ +void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { + CHECK(isContiguous()); + CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix types are not equal"; + size_t numSamples = getHeight(); + size_t beam = maxVal.getWidth(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getHeight(), numSamples); + CHECK_EQ(maxVal.getWidth(), beam); + + real* a = getData(); + int* s = maxIds.getData(); + real* t = maxVal.getData(); + size_t dim = getWidth(); + for (size_t i = 0; i < numSamples; i++) { + std::vector<std::pair<real, size_t>> vec; + for (size_t j = 0; j < dim; j++) { + vec.push_back(std::pair<real, size_t>(a[i * dim + j], j)); + } + + std::partial_sort( + vec.begin(), + vec.begin() + beam, + vec.end(), + [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < beam; j++) { + t[i * beam + j] = vec[j].first; + s[i * beam + j] = vec[j].second; + } + } +} + +void CpuMatrix::colMax(Matrix& max) { + CHECK_EQ(max.getWidth(), getWidth()); + CHECK_EQ(max.getHeight(), (size_t)1); + max.maxCols(*this); +} + +void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { + CHECK(isContiguous()); + CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix types are not equal"; + size_t numSamples = getWidth(); + size_t beam = maxVal.getHeight(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getWidth(), numSamples); + + real* a = getData(); + int* s = maxIds.getData(); + real* t = maxVal.getData(); + size_t dim = getHeight(); + for (size_t i = 0; i < numSamples; i++) { + std::vector<std::pair<real, size_t>> vec; + for (size_t j = 0; j < dim; j++) { + vec.push_back(std::pair<real, size_t>(a[i + j * numSamples], j)); + } + + std::partial_sort( + vec.begin(), + vec.begin() + beam, + vec.end(), + [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < beam; j++) { + t[i + j * numSamples] = vec[j].first; + s[i + j * numSamples] = vec[j].second; + } + } +} + +void CpuMatrix::maxoutForward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast<CpuMatrix*>(&a)); + CHECK(dynamic_cast<CpuIVector*>(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = getWidth(); + size_t batchSize = getHeight(); + size_t featLen = size / channels; + const real* input = a.getData(); + int* idForCpu = id.getData(); + + MatrixPtr maxInMat, maxOutMat; + Matrix::resizeOrCreate(maxInMat, groups, size, false, false); + Matrix::resizeOrCreate(maxOutMat, 1, size, false, false); + + for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { + size_t newIndex = batch_idx * size;
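+ // tmpId records, for each output element, the index of the group that + // produced the maximum; maxoutBackward uses it to route gradients.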
+ IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false); + + for (size_t i = 0; i < channels; ++i) { + size_t newFeatLen = i * featLen; + for (size_t j = 0; j < groups; ++j) { + maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen) + ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen, + featLen); + } + } + maxInMat->colMax(*tmpId, *maxOutMat); + this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat); + } +} + +void CpuMatrix::maxoutBackward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast<CpuMatrix*>(&a)); + CHECK(dynamic_cast<CpuIVector*>(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = a.getWidth(); + size_t batchSize = getHeight(); + size_t featLen = size / channels; + size_t newFeatLen = groups * featLen; + real* inputG = getData(); + const real* outG = a.getData(); + int* idForCpu = id.getData(); + + for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { + size_t newIndex = batch_idx * size; + int* idData = idForCpu + newIndex; + + for (size_t i = 0; i < size; ++i) { + int gradIdx = + idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen; + (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i]; + } + } +} + +void CpuMatrix::rowNormalizeL1(Matrix& out) { + CHECK(!out.useGpu()); + + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(out.getHeight(), numSamples); + CHECK_EQ(out.getWidth(), dim); + real* a = getData(); + real* b = out.getData(); + for (size_t i = 0; i < numSamples; ++i) { + real s = 0; + for (size_t j = 0; j < dim; ++j) { + s += a[i * dim + j]; + } + // Right now, we just bet that sum won't be zero. If this really happens, + // we will figure out what should be done then. + CHECK_GT(s, 0); + s = 1 / s; + for (size_t j = 0; j < dim; ++j) { + b[i * dim + j] = s * a[i * dim + j]; + } + } +} + +/* calculate classification error */ +void CpuMatrix::classificationError(Matrix& output, + IVector& label, + size_t topkSize) { + size_t numSamples = this->getHeight(); + auto cpuOutput = dynamic_cast<CpuMatrix*>(&output); + auto cpuLabel = dynamic_cast<CpuIVector*>(&label); + IVectorPtr cpuTopIds = std::make_shared<CpuIVector>(numSamples * topkSize); + MatrixPtr cpuTopVal = std::make_shared<CpuMatrix>(numSamples, topkSize); + + CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer"; + CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed"; + CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal"; + CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1) + << "Matrix dimensions are not equal"; + + // top k matrix classification + cpuOutput->rowMax(*cpuTopIds, *cpuTopVal); + + size_t dim = cpuOutput->getWidth(); + real* result = this->getData(); + int* ids = cpuTopIds->getData(); + int* lbl = cpuLabel->getData(); + for (size_t i = 0; i < numSamples; ++i) { + CHECK_GE(lbl[i], 0); + CHECK_LT((size_t)lbl[i], dim); + + for (size_t j = 0; j < topkSize; ++j) { + if (ids[j + i * topkSize] == lbl[i]) { + result[i] = 0; + break; + } + result[i] = 1.0f; + } + } +} + +/* copy -log(output[label]) to this->data[i] */ +void CpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) { + CHECK(dynamic_cast<CpuMatrix*>(&output)); + CHECK(dynamic_cast<CpuIVector*>(&label)); + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getSize(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(getWidth(), (size_t)1); + + real* out = output.getData(); + real* cost = getData(); + int* lbl = label.getData(); + for (size_t i = 0; i < numSamples; ++i, out += dim) { + CHECK_GE(lbl[i], 0);
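+ // cost[i] = -log(out[lbl[i]]), the negative log-likelihood of the + // ground-truth label for sample i.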
+ CHECK_LT((size_t)lbl[i], dim); + cost[i] = -std::log(out[lbl[i]]); + } +} + +/* calculate the error of outputV according to label */ +void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) { + CHECK(dynamic_cast<CpuMatrix*>(&output)); + CHECK(dynamic_cast<CpuIVector*>(&label)); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(output.getWidth(), dim); + real* out = output.getData(); + real* grad = getData(); + int* lbl = label.getData(); + for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { + grad[lbl[i]] -= 1 / out[lbl[i]]; + } +} + +/* + We implement the matrix functionality in CostLayer.cpp, + but we define the scalar function here for sanity checking; + deleting this function does not affect anything nevertheless. +*/ +void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha) { + CHECK(dynamic_cast<CpuMatrix*>(&output)); + CHECK(dynamic_cast<CpuIVector*>(&label)); + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getSize(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(getWidth(), (size_t)1); + + real* out = output.getData(); + real* cost = getData(); + int* lbl = label.getData(); + for (size_t i = 0; i < numSamples; ++i, out += dim) { + CHECK_GE(lbl[i], 0); + CHECK_LT((size_t)lbl[i], dim); + real sum = 0; + for (size_t j = 0; j < dim; ++j) { + sum += out[j]; + } + sum = _safelog(sum); + cost[i] = -_safelog(out[lbl[i]]) + sum + alpha * _square(sum); + } +} + +/* + We implement the matrix functionality in CostLayer.cpp, + but we define the scalar function here for sanity checking; + deleting this function does not affect anything nevertheless. +*/ +void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output, + IVector& label, + real alpha) { + CHECK(dynamic_cast<CpuMatrix*>(&output)); + CHECK(dynamic_cast<CpuIVector*>(&label)); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(output.getWidth(), dim); + real* out = output.getData(); + real* grad = getData(); + int* lbl = label.getData(); + + for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { + grad[lbl[i]] -= 1 / out[lbl[i]]; + real sum = 0; + for (size_t j = 0; j < dim; ++j) { + sum += out[j]; + } + for (size_t j = 0; j < dim; ++j) { + if (j == (size_t)lbl[i]) { + grad[j] += -1 / out[j]; + } + grad[j] += 1 / sum + 2 * alpha * _safelog(sum) / sum; + } + } +} + +#define FORWARD_LOOP() \ + size_t numSamples = getHeight(); \ + size_t dim = getWidth(); \ + CHECK_EQ(output.getHeight(), numSamples); \ + CHECK_EQ(output.getWidth(), dim); \ + const real* in = getData(); \ + real* out = output.getData(); \ + for (size_t i = 0; i < numSamples; ++i, in += dim, out += dim) + +#define BACKWARD_LOOP() \ + size_t numSamples = getHeight(); \ + size_t dim = getWidth(); \ + CHECK_EQ(output.getHeight(), numSamples); \ + CHECK_EQ(output.getWidth(), dim); \ + real* grad = getData(); \ + real* out = output.getData(); \ + for (size_t i = 0; i < numSamples; ++i, grad += dim, out += dim) + +void CpuMatrix::softmax(Matrix& output) { + CHECK(!output.useGpu()); + + const float THRESHOLD = -64.0; + + FORWARD_LOOP() { + real max = -1.0e20; + for (size_t j = 0; j < dim; ++j) { + if (in[j] > max) { + max = in[j]; + } + } + for (size_t j = 0; j < dim; ++j) { + real a = in[j] - max; + if (a < THRESHOLD) { + a = THRESHOLD; + } + out[j] = a; + } + vExp(dim, out, out); + + real sum = 0; + for (size_t j = 0; j < dim; ++j) { + sum += out[j]; + } + sum = 1 / sum; + for (size_t j = 0; j < dim; ++j) { + out[j] *= sum; + } + } +}
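+ +// sequenceSoftmax applies the numerically stable softmax above +// independently to every subsequence [starts[i], starts[i + 1]) of a +// single-column matrix.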
+void CpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) { + CHECK_EQ(getWidth(), 1UL); + CHECK_EQ(output.getWidth(), 1UL); + CHECK(isContiguous()); + + MatrixPtr inTmp = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + false); + MatrixPtr outTmp = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + false); + size_t numSequences = index.getSize() - 1; + auto starts = index.getData(); + for (size_t i = 0; i < numSequences; ++i) { + size_t offset = starts[i]; + size_t size = starts[i + 1] - starts[i]; + inTmp->setData(getData() + offset, 1UL, size); + outTmp->setData(output.getData() + offset, 1UL, size); + inTmp->softmax(*outTmp); + } +} + +void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { + CHECK(output.useGpu_ == false) << "Matrix types are not equal"; + CHECK_EQ(getHeight(), sftmaxSum.getHeight()); + + real* sums = sftmaxSum.getData(); + + BACKWARD_LOOP() { + real sum = sums[i]; + for (size_t j = 0; j < dim; ++j) { + grad[j] = out[j] * (grad[j] - sum); + } + } +} + +void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix types are not equal"; + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(label.getWidth(), dim); + CHECK_EQ(getWidth(), (size_t)1); + real* out = output.getData(); + real* cost = getData(); + + auto labelptr = dynamic_cast<CpuSparseMatrix*>(&label); + if (labelptr) { + // it is a CpuSparseMatrix + if (labelptr->getFormat() == SPARSE_CSR) { + // treat label as a SparseMatrix + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = 0; j < dim; ++j) { + cost[i] += _square(out[i * dim + j]); + } + } + if (labelptr->getValueType() == NO_VALUE) { + int* cols = labelptr->getCols(); + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = labelptr->getRowStartIdx(i); + j < labelptr->getRowStartIdx(i + 1); + ++j) { + cost[i] += 1.0 - 2.0 * out[i * dim + cols[j]]; + /* + * explanation of the above line: the original code is as follows: + * cost[i] -= _square(out[i * dim + feature.col]); + * cost[i] += _square(1.0 - out[i * dim + feature.col]); + */ + } + } + } else if (labelptr->getValueType() == FLOAT_VALUE) { + int* cols = labelptr->getCols(); + real* values = labelptr->getValue(); + for (size_t i = 0; i < numSamples; ++i) { + real sum1 = 0; + real sum2 = 0; + for (size_t j = labelptr->getRowStartIdx(i); + j < labelptr->getRowStartIdx(i + 1); + ++j) { + sum1 += values[j] * values[j]; + sum2 += values[j] * out[i * dim + cols[j]]; + /* + * explanation of the above line: the original code is as follows: + * cost[i] -= _square(out[i * dim + feature.col]); + * cost[i] += _square(value.col - out[i * dim + feature.col]); + */ + } + cost[i] += sum1 - 2.0 * sum2; + } + } else { + LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares"; + return; + } + return; + } else { + LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares"; + return; + } + } + + BaseMatrix::sumOfSquaredDiffs(output, + label, + /* scaleSum= */ 1, + /* scaleDest= */ 1); +} + +/* calculate the error of outputV according to label */ +void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix types are not equal"; + + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(output.getWidth(), dim); + CHECK_EQ(label.getWidth(), dim); + + real* out = output.getData(); + real* grad = getData();
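+ // The dense case computes grad += 2 * (out - label); the sparse-label + // branches below expand the same expression around the stored nonzeros.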
+ + auto labelptr = dynamic_cast<CpuSparseMatrix*>(&label); + if (labelptr) { + // it is a CpuSparseMatrix + if (labelptr->getFormat() == SPARSE_CSR) { + // treat label as a SparseMatrix + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = 0; j < dim; ++j) { + grad[i * dim + j] += 2.0 * out[i * dim + j]; + } + } + if (labelptr->getValueType() == NO_VALUE) { + int* cols = labelptr->getCols(); + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = labelptr->getRowStartIdx(i); + j < labelptr->getRowStartIdx(i + 1); + ++j) { + grad[i * dim + cols[j]] -= 2.0; + /* + * explanation of the above line: the original code is as follows: + * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col]; + * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col] + * - 1); + */ + } + } + } else if (labelptr->getValueType() == FLOAT_VALUE) { + int* cols = labelptr->getCols(); + real* values = labelptr->getValue(); + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = labelptr->getRowStartIdx(i); + j < labelptr->getRowStartIdx(i + 1); + ++j) { + grad[i * dim + cols[j]] -= 2.0 * values[j]; + /* + * explanation of the above line: the original code is as follows: + * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col]; + * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col] + * - value.col); + */ + } + } + } else { + LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares"; + return; + } + return; + } else { + LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares"; + return; + } + } + + real* lbl = label.getData(); + size_t ld = getStride(); + size_t outLd = output.getStride(); + size_t lblLd = label.getStride(); + CHECK(lbl); + for (size_t i = 0; i < numSamples; + ++i, out += outLd, lbl += lblLd, grad += ld) { + for (size_t j = 0; j < dim; ++j) { + grad[j] += 2.0 * (out[j] - lbl[j]); // positive gradient; + } + } +} + +void CpuMatrix::smoothL1(Matrix& output, Matrix& label, real destScale) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix types are not equal"; + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(label.getWidth(), dim); + CHECK_EQ(getWidth(), (size_t)1); + + real* cost = getData(); + real* out = output.getData(); + real* lbl = label.getData(); + + for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) { + for (size_t j = 0; j < dim; ++j) { + real absVal = std::fabs(out[j] - lbl[j]); + cost[i] *= destScale; + if (absVal < 1.0) + cost[i] += 0.5 * absVal * absVal; + else + cost[i] += absVal - 0.5; + } + } +} + +void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label, real destScale) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix types are not equal"; + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(label.getWidth(), dim); + CHECK_EQ(getWidth(), dim); + + real* out = output.getData(); + real* lbl = label.getData(); + real* grad = getData(); + + for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) { + for (size_t j = 0; j < dim; ++j) { + real val = out[j] - lbl[j]; + grad[j] *= destScale; + if (std::fabs(val) < 1) { + grad[j] += val; + } else { + grad[j] += (real(0) < val) - (val < real(0)); + } + } + } +} + +void CpuMatrix::tanh(Matrix& output) { + CHECK(isContiguous()); + CHECK(output.isContiguous()); + size_t numSamples = getHeight(); + size_t dim = 
getWidth(); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(output.getWidth(), dim); + vTanh(numSamples * dim, getData(), output.getData()); +} + +void CpuMatrix::tanhDerivative(Matrix& output) { + BaseMatrix::tanhDerivative(output); +} + +void CpuMatrix::softrelu(Matrix& output) { + CHECK(isContiguous()); + CHECK(output.isContiguous()); + const real THRESHOLD = 40.0; + FORWARD_LOOP() { // TODO(yuyang18): SIMD it? + for (size_t j = 0; j < dim; ++j) { + real x = in[j]; + if (x > THRESHOLD) { + x = THRESHOLD; + } else if (x < -THRESHOLD) { + x = -THRESHOLD; + } + out[j] = x; + } + } + vExp(numSamples * dim, output.getData(), output.getData()); + vLog1p(numSamples * dim, output.getData(), output.getData()); +} + +void CpuMatrix::softreluDerivative(Matrix& output) { + CHECK(isContiguous()); + CHECK(output.isContiguous()); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + size_t size = numSamples * dim; + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(output.getWidth(), dim); + real* grad = getData(); + MatrixPtr tmpMat = Matrix::create(numSamples, dim); + real* tmp = tmpMat->getData(); + + vExp(size, output.getData(), tmpMat->getData()); + + for (size_t i = 0; i < size; ++i) { + grad[i] *= (1.0 - 1.0 / tmp[i]); + } +} + +void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { + CHECK(isContiguous()); + CHECK(output.isContiguous()); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(output.getWidth(), dim); + + const real* in = getData(); + real* out = output.getData(); + + // out = p2*in + for (size_t i = 0; i < numSamples * dim; ++i) { + out[i] = p2 * in[i]; + } + + vTanh(numSamples * dim, out, out); + + // out = p1 * out + for (size_t i = 0; i < numSamples * dim; ++i) { + out[i] = p1 * out[i]; + } +} + +/* uniform randomization, minimize precision = 1e-5 */ +void CpuMatrix::randomizeUniform() { + CHECK(isContiguous()); + real* data = getData(); + unsigned int* randSeed = ThreadLocalRand::getSeed(); + real recipRandMax = 1.0f / (real)RAND_MAX; + for (size_t i = 0; i < elementCnt_; ++i) { + *data++ = rand_r(randSeed) * recipRandMax; + } +} + +void CpuMatrix::print(std::ostream& os) const { + CHECK(isContiguous()); + for (size_t i = 0; i < height_; ++i) { + for (size_t j = 0; j < width_; ++j) { + os << data_[i * width_ + j] << " "; + } + os << std::endl; + } +} + +void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) { + real* input = data.getData(); + real* w = W.getData(); + real* output = data_; + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + + size_t partial_sum = numElements / paraSize; + if (paraSize == numElements) { + for (size_t n = 0; n < numSamples * numElements; ++n) { + output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements]; + } + return; + } + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + for (size_t n = 0; n < numSamples; ++n) { + for (size_t i = 0; i < paraSize; i++) { + neon::prelu( + input + i * partial_sum, w[i], output + i * partial_sum, partial_sum); + } + input = input + numElements; + output = output + numElements; + } +#else + for (size_t n = 0, k = 0; n < numSamples; ++n) { + for (size_t i = 0; i < numElements; ++i, ++k) { + output[k] = input[k] > 0 ? 
input[k] : input[k] * w[i / partial_sum]; + } + } +#endif +} + +void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { + real* ograd = oGrad.getData(); + real* input = data.getData(); + real* wgrad = data_; + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = this->getHeight() * this->getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + for (size_t n = 0, k = 0; n < numSamples; ++n) { + for (size_t i = 0; i < numElements; ++i, ++k) { + wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]); + } + } +} + +void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { + real* diff = data_; + real* input = data.getData(); + real* ograd = oGrad.getData(); + real* w = W.getData(); + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + for (size_t n = 0, k = 0; n < numSamples; ++n) { + for (size_t i = 0; i < numElements; ++i, ++k) { + diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]); + } + } +} + +void CpuMatrix::print(std::ostream& os, size_t height, size_t width) const { + CHECK(isContiguous()); + size_t h = height_ < height ? height_ : height; + size_t w = width_ < width ? width_ : width; + os.setf(std::ostream::scientific); + os << "["; + for (size_t i = 0; i < h; ++i) { + for (size_t j = 0; j < w; ++j) { + os << data_[i * width_ + j] << " "; + } + if (i == h - 1) { + os << "]"; + } + os << std::endl; + } +} + +void CpuMatrix::printOneRow(std::ostream& os, size_t idx) const { + CHECK_LT(idx, height_); + size_t offset = idx * stride_; + os << data_[offset]; + for (size_t i = 1; i < width_; ++i) { + os << " " << data_[offset + i]; + } + os << ";"; +} + +void CpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { + CHECK(isContiguous()); + CHECK(height_ == refMat.getHeight()); + CHECK(width_ == refMat.getWidth()); + CpuMatrix cpuRef(height_, width_); + cpuRef.copyFrom(refMat); + size_t diffCnt = 0; + for (size_t i = 0; i < height_; ++i) { + for (size_t j = 0; j < width_; ++j) { + real a = getElement(i, j); + real b = cpuRef.getElement(i, j); + if (fabs(a - b) > 0.00001) { + ++diffCnt; + if (printDiff) { + os << "ref= " << a << " check= " << b << std::endl; + } + } + } + } + LOG(INFO) << "the diffCnt is " << diffCnt; +} + +real CpuMatrix::getMin() { + size_t size = getHeight() * getWidth(); + real* data = getData(); + real res = data[0]; + for (size_t i = 1; i < size; ++i) { + if (res > data[i]) { + res = data[i]; + } + } + return res; +} + +real CpuMatrix::getMax() { + size_t size = getHeight() * getWidth(); + real* data = getData(); + real res = data[0]; + for (size_t i = 1; i < size; ++i) { + if (res < data[i]) { + res = data[i]; + } + } + return res; +} + +void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) { + size_t height = this->getHeight(); + size_t width0 = this->getWidth(); + size_t width1 = in1.getWidth(); + + CHECK_EQ(height, in0.getHeight()); + CHECK_EQ(width0, in0.getWidth()); + CHECK_EQ(height, in1.getHeight()); + + CHECK_EQ(width1 % 2, 1U); + + real* outV = this->getData(); + real* inV0 = in0.getData(); + real* inV1 = in1.getData(); + + int leftCtxLen = (width1 - 1) / 2; + for (size_t x = 0; x < height; + ++x, outV += width0, inV0 += width0, inV1 += width1) { + for (size_t i = 
0; i < width0; ++i) { // each dimension of output + for (size_t j = 0; j < width1; ++j) { + // iterate over all dimensions of inV1 + int index = i + j - leftCtxLen; + index = (index + width0) % width0; + outV[i] += inV0[index] * inV1[j]; + } + } + } +} + +void CpuMatrix::circularConvDerivative( + Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) { + size_t height = in0.getHeight(); + size_t width0 = in0.getWidth(); + size_t width1 = in1.getWidth(); + + CHECK_EQ(height, in1.getHeight()); + CHECK_EQ(height, inG0.getHeight()); + CHECK_EQ(width0, inG0.getWidth()); + CHECK_EQ(height, inG1.getHeight()); + CHECK_EQ(width1, inG1.getWidth()); + CHECK_EQ(height, outG.getHeight()); + CHECK_EQ(width0, outG.getWidth()); + + real* outGV = outG.getData(); + real* inV0 = in0.getData(); + real* inV1 = in1.getData(); + real* inGV0 = inG0.getData(); + real* inGV1 = inG1.getData(); + + int leftCtxLen = (width1 - 1) / 2; + for (size_t x = 0; x < height; ++x, + outGV += width0, + inV0 += width0, + inV1 += width1, + inGV0 += width0, + inGV1 += width1) { + for (size_t j = 0; j < width1; ++j) { // iterate over width1 + for (size_t i = 0; i < width0; ++i) { + // iterate over all dimensions of outG + int index = i + j - leftCtxLen; + index = (index + width0) % width0; + inGV0[index] += outGV[i] * inV1[j]; + inGV1[j] += outGV[i] * inV0[index]; + } + } + } +} + +void CpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { + CHECK(dynamic_cast<CpuMatrix*>(&output)); + auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label); + CHECK(labelPtr); + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(numSamples, output.getHeight()); + CHECK_EQ(numSamples, labelPtr->getHeight()); + CHECK_EQ(dim, labelPtr->getWidth()); + + real* out = output.getData(); + real* cost = getData(); + for (size_t i = 0; i < numSamples; ++i, out += dim) { + for (size_t j = 0; j < dim; ++j) { + CHECK(out[j] > 0 && out[j] < 1.0); + cost[i] -= std::log(1 - out[j]); + } + + const int* cols = labelPtr->getRowCols(i); + for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { + CHECK_LT(size_t(cols[j]), dim); + cost[i] -= std::log(out[cols[j]] / (1 - out[cols[j]])); + } + } +} + +void CpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { + CHECK(dynamic_cast<CpuMatrix*>(&output)); + auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label); + CHECK(labelPtr); + + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(numSamples, output.getHeight()); + CHECK_EQ(numSamples, labelPtr->getHeight()); + CHECK_EQ(dim, output.getWidth()); + CHECK_EQ(dim, labelPtr->getWidth()); + + real* out = output.getData(); + real* grad = getData(); + for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { + for (size_t j = 0; j < dim; ++j) { + CHECK(out[j] > 0 && out[j] < 1.0); + grad[j] += 1.0 / (1 - out[j]); + } + + const int* cols = labelPtr->getRowCols(i); + for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { + CHECK_LT(size_t(cols[j]), dim); + grad[cols[j]] -= 1.0 / (out[cols[j]] * (1 - out[cols[j]])); + } + } +} + +/* calculate the classification error for multi binary label */ +void CpuMatrix::classificationErrorMulti(Matrix& output, + Matrix& label, + real threshold) { + CHECK(dynamic_cast<CpuMatrix*>(&output)); + auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label); + CHECK(labelPtr); + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(numSamples, output.getHeight()); + CHECK_EQ(numSamples, labelPtr->getHeight()); + CHECK_EQ(dim, labelPtr->getWidth()); + + real* out = output.getData(); + real* result = getData();
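+ // result[i] is the fraction of the dim binary outputs that disagree + // with the multi-binary label at the given threshold.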
(size_t i = 0; i < numSamples; ++i, out += dim) { + real sum = 0.0; + for (size_t j = 0; j < dim; ++j) { + if (out[j] >= threshold) { + sum += 1.0; + } + } + + const int* cols = labelPtr->getRowCols(i); + for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { + CHECK_LT(size_t(cols[j]), dim); + if (out[cols[j]] < threshold) { + sum += 1.0; + } else { + sum -= 1.0; + } + } + result[i] = sum / dim; + } +} + +void CpuMatrix::bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast<const CpuMatrix*>(&in)); + + size_t outputW = getWidth(); + size_t batchSize = getHeight(); + size_t inputW = in.getWidth(); + size_t inputH = in.getHeight(); + size_t inPosOffset = inImgH * inImgW; + size_t outPosOffset = outImgH * outImgW; + (void)(inputH); + + real* outData = getData(); + const real* inData = in.getData(); + + if (inImgH == outImgH && inImgW == outImgW) { + this->copyFrom(in); + } else { + for (size_t k = 0; k < batchSize; ++k) { // loop for batches + for (size_t i = 0; i < outImgH; ++i) { // loop over output height + size_t h = ratioH * i; + size_t hid = (h < inImgH - 1) ? 1 : 0; + real h1lambda = ratioH * i - h; + real h2lambda = 1 - h1lambda; + + for (size_t j = 0; j < outImgW; ++j) { + size_t w = ratioW * j; + size_t wid = (w < inImgW - 1) ? 1 : 0; + real w1lambda = ratioW * j - w; + real w2lambda = 1 - w1lambda; + // calculate four positions for bilinear interpolation + const real* inPos = &inData[k * inputW + h * inImgW + w]; + real* outPos = &outData[k * outputW + i * outImgW + j]; + for (size_t c = 0; c < numChannels; ++c) { // loop for channels + // bilinear interpolation + outPos[0] = + h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) + + h1lambda * (w2lambda * inPos[hid * inImgW] + + w1lambda * inPos[hid * inImgW + wid]); + inPos += inPosOffset; + outPos += outPosOffset; + } + } + } + } + } +} + +void CpuMatrix::bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast<const CpuMatrix*>(&out)); + + size_t inputW = getWidth(); + size_t inputH = getHeight(); + size_t outputW = out.getWidth(); + size_t batchSize = out.getHeight(); + size_t inPosOffset = inImgH * inImgW; + size_t outPosOffset = outImgH * outImgW; + (void)(inputH); + + real* inGrad = getData(); + const real* outGrad = out.getData(); + + if (inImgH == outImgH && inImgW == outImgW) { + this->add(const_cast<Matrix&>(out)); + } else { + for (size_t k = 0; k < batchSize; ++k) { // loop for batches + for (size_t i = 0; i < outImgH; ++i) { // loop over output height + size_t h = ratioH * i; + size_t hid = (h < inImgH - 1) ? 1 : 0; + real h1lambda = ratioH * i - h; + real h2lambda = 1 - h1lambda; + for (size_t j = 0; j < outImgW; ++j) { + size_t w = ratioW * j; + size_t wid = (w < inImgW - 1) ?
1 : 0; + real w1lambda = ratioW * j - w; + real w2lambda = 1 - w1lambda; + + real* inPos = &inGrad[k * inputW + h * inImgW + w]; + const real* outPos = &outGrad[k * outputW + i * outImgW + j]; + for (size_t c = 0; c < numChannels; ++c) { // loop for channels + inPos[0] += h2lambda * w2lambda * outPos[0]; + inPos[wid] += h2lambda * w1lambda * outPos[0]; + inPos[hid * inImgW] += h1lambda * w2lambda * outPos[0]; + inPos[hid * inImgW + wid] += h1lambda * w1lambda * outPos[0]; + inPos += inPosOffset; + outPos += outPosOffset; + } + } + } + } + } +} + +void CpuMatrix::vol2Col(real* data, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW) { + real* outData = getData(); + int outHeight = (height + 2 * paddingH - filterH) / strideH + 1; + int outWidth = (width + 2 * paddingW - filterW) / strideW + 1; + int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1; + + int channelsCol = channels * filterD * filterH * filterW; + for (int c = 0; c < channelsCol; ++c) { + int wOffset = c % filterW; + int hOffset = (c / filterW) % filterH; + int dOffset = (c / filterW / filterH) % filterD; + int cIn = c / filterW / filterH / filterD; + for (int d = 0; d < outDepth; ++d) { + for (int h = 0; h < outHeight; ++h) { + for (int w = 0; w < outWidth; ++w) { + int dPad = d * strideD - paddingD + dOffset; + int hPad = h * strideH - paddingH + hOffset; + int wPad = w * strideW - paddingW + wOffset; + + if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width && + dPad >= 0 && dPad < depth) + outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = + data[((cIn * depth + dPad) * height + hPad) * width + wPad]; + else + outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = 0; + } + } + } + } +} + +void CpuMatrix::col2Vol(real* trg, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real alpha, + real beta) { + real* src = getData(); + int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1; + int outHeight = (height + 2 * paddingH - filterH) / strideH + 1; + int outWidth = (width + 2 * paddingW - filterW) / strideW + 1; + int channelsCol = channels * filterD * filterH * filterW; + for (int c = 0; c < channelsCol; ++c) { + int wOffset = c % filterW; + int hOffset = (c / filterW) % filterH; + int dOffset = (c / filterW / filterH) % filterD; + int cIm = c / filterW / filterH / filterD; + for (int d = 0; d < outDepth; ++d) { + for (int h = 0; h < outHeight; ++h) { + for (int w = 0; w < outWidth; ++w) { + int dPad = d * strideD - paddingD + dOffset; + int hPad = h * strideH - paddingH + hOffset; + int wPad = w * strideW - paddingW + wOffset; + if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width && + dPad >= 0 && dPad < depth) + trg[((cIm * depth + dPad) * height + hPad) * width + wPad] = + alpha * + src[((c * outDepth + d) * outHeight + h) * outWidth + w] + + beta * + trg[((cIm * depth + dPad) * height + hPad) * width + wPad]; + } + } + } + } +} + +//////////////////////////////////////////////////////////////// +// functions executed via cpu // +//////////////////////////////////////////////////////////////// + +void GpuMatrix::selectElements(Matrix& table, IVector& ids) { + execViaCpu2(&CpuMatrix::selectElements, *this, table, ids); +} +} // namespace paddle diff --git a/paddle/legacy/math/Matrix.h 
b/paddle/legacy/math/Matrix.h new file mode 100644 index 0000000000000000000000000000000000000000..74dc690792c189db537450d9e3e6cc02f68b48ca --- /dev/null +++ b/paddle/legacy/math/Matrix.h @@ -0,0 +1,2189 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <stdint.h> +#include <memory> +#include <thread> + +#include "paddle/utils/Logging.h" +#include "paddle/utils/ThreadLocal.h" + +#include <hl_gpu.h> + +#include "BaseMatrix.h" +#include "MemoryHandle.h" +#include "Vector.h" +#include "paddle/utils/Common.h" +#include "paddle/utils/ThreadLocal.h" + +namespace paddle { + +/// TODO(tianbing), move to paddle/legacy/function/TensorType.h +enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 }; + +/** + * @brief matrix sparse_format. + * + * nnz represents nonzero number in sparse matrix. + * + * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element + * represents row start index in Matrix. length of col and value are nnz. + * + * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element + * represents col start index in Matrix. length of row and value are nnz. + * + * @code + * for example: [0, 1, 0, 2, 0; + * 1, 0, 0, 0, 0; + * 0, 0, 0, 2, 5]; + * SPARSE_CSR row [0, 2, 3, 5]; + * col [1, 3, 0, 3, 4]; + * value [1, 2, 1, 2, 5] + * SPARSE_CSC col [0, 1, 2, 2, 4, 5]; + * row [1, 0, 0, 2, 2]; + * value [1, 1, 2, 2, 5] + * @endcode + */ +/// TODO(tianbing), move to paddle/legacy/function/TensorType.h +enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 }; + +class Matrix; +class GpuMatrix; +class CpuMatrix; +class CpuSparseMatrix; +class GpuSparseMatrix; +typedef std::shared_ptr<Matrix> MatrixPtr; +typedef std::shared_ptr<GpuMatrix> GpuMatrixPtr; +typedef std::shared_ptr<CpuMatrix> CpuMatrixPtr; +typedef std::shared_ptr<GpuSparseMatrix> GpuSparseMatrixPtr; +typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr; + +/** + * Copy or assignment constructor will share the data as opposed to making a + * copy of the original data. To make a copy of the original data, use copyFrom() + * instead.
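 + * + * A minimal sketch of the distinction (shapes and variable names here are + * illustrative assumptions, not part of this header): + * @code + * CpuMatrix a(2, 3); // owns a 2x3 buffer + * CpuMatrix b(a.getData(), 2, 3); // shares a's buffer, no copy made + * MatrixPtr c = Matrix::create(2, 3); + * c->copyFrom(a); // explicit deep copy of the original data + * @endcode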
+ */ +class Matrix : public BaseMatrix { + protected: + Matrix(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans, + bool use_gpu); + + Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu); + + Matrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, + bool use_gpu); + + static ThreadLocal<MatrixPtr> tmpMat_; + + public: + size_t elementCnt_; // maximal number of elements which can be held in data_ + MemoryHandlePtr memoryHandle_; + + public: + virtual ~Matrix() {} + + static MatrixPtr create(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans = false); + static MatrixPtr create(size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + static MatrixPtr create(real* data, + size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + static MatrixPtr create(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false, + bool useGpu = false); + + static MatrixPtr createSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + bool trans = false, + bool useGpu = false); + static MatrixPtr createSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + SparseFormat format = SPARSE_CSR, + bool trans = false, + bool useGpu = false); + + static MatrixPtr createSparseMatrix(real* data, + int* row, + int* col, + size_t height, + size_t width, + size_t nnz, /* used to allocate space */ + SparseValueType valueType, /*value type*/ + SparseFormat format, + bool trans, + bool useGpu); + + static void resizeOrCreateSparseMatrix( + MatrixPtr& matrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + SparseFormat format = SPARSE_CSR, + bool trans = false, + bool useGpu = false); + + static void resizeOrCreate(MatrixPtr& a, + size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + + /** + * @brief set the data buffer used to hold the matrix data. + * + * caller should make sure that the size of data is at least + * sizeof(real)*height*width. + */ + void setData(real* data) { + BaseMatrix::setData(data); + memoryHandle_.reset(); + } + + /// the data should be contiguous + void setData(real* data, size_t newHeight, size_t newWidth) { + setData(data); + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newHeight * newWidth; + stride_ = width_; + } + + size_t getWidth() const { return width_; } + size_t getHeight() const { return height_; } + size_t getStride() const { return stride_; } + size_t getElementCnt() const { return elementCnt_; } + virtual real* getData() { return data_; } + virtual const real* getData() const { return data_; } + bool isTransposed() const { return trans_; } + bool isContiguous() const { return stride_ == width_ || height_ == 1; } + + // If this is a sparse matrix, it must be dynamic_cast to + // CpuSparseMatrix/GpuSparseMatrix before calling the following functions. + // They are declared in the base class just to make them easy to call. + // These declarations should be moved to a base class of the sparse matrix + // if the sparse matrix is refactored. + virtual int* getRows() const { + LOG(FATAL) << "Not implemented"; + return nullptr; //! suppress warning for no return value. + } + + virtual int* getCols() const { + LOG(FATAL) << "Not implemented"; + return nullptr; //! suppress warning for no return value. + } + + virtual SparseFormat getFormat() const { + LOG(FATAL) << "Not implemented"; + return SPARSE_CSR; //!
suppress warning for no return value. + } + + virtual SparseValueType getValueType() const { + LOG(FATAL) << "Not implemented"; + return NO_VALUE; //! suppress warning for no return value. + } + + /** + * @brief matrix element-wise add + * + * Named add3 just because add/add2 has been used in BaseMatrix.cu + * and they are not virtual functions. + */ + virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; } + + MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; } + + virtual void zeroMem() { LOG(FATAL) << "Not implemented"; } + + virtual void resetOne() { LOG(FATAL) << "Not implemented"; } + + void setDiag(real value); + + virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; } + + virtual void trimFrom(const CpuSparseMatrix& src) { + LOG(FATAL) << "Not implemented"; + } + + // For GpuMatrix this is an asynchronous copy interface + // For CpuMatrix this is a synchronous copy interface + virtual void copyFrom(const Matrix& src, hl_stream_t stream) { + LOG(FATAL) << "Not implemented"; + } + + MatrixPtr subMatrix(size_t startRow, + size_t endRow, + size_t startCol, + size_t endCol); + + MatrixPtr subRowMatrix(size_t startRow, size_t endRow) { + return subMatrix(startRow, endRow, 0, getWidth()); + } + + MatrixPtr subColMatrix(size_t startCol, size_t endCol) { + return subMatrix(0, getHeight(), startCol, endCol); + } + + virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) { + CHECK_LE(startRow + numRows, getHeight()); + return Matrix::create(getData() + startRow * getWidth(), + numRows, + getWidth(), + trans_, + useGpu_); + } + virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) { + CHECK_LE(startRow + numRows, getHeight()); + CHECK_EQ(useGpu_, dest->useGpu_); + dest->setData(this->rowBuf(startRow), numRows, getWidth()); + return dest; + } + + /** + * If this is GpuMatrix, src is assumed to be CPU memory + * + * If this is CpuMatrix, src is assumed to be CPU memory + */ + virtual void copyFrom(const real* src, size_t size) { + LOG(FATAL) << "Not implemented"; + } + + virtual void copyFrom(const real* src, const int64_t* seq) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief convert an int vector to a real matrix. + * + * (1) source and dest are both in CPU. + * + * (2) sizes exactly match. + */ + virtual void copyFrom(const IVector& src) { + LOG(FATAL) << "copy data from int vector only available on CpuMatrix."; + } + + virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix, + * NonValueSparseMatrix, etc.) as this. + * + * If height and width are zero, the new matrix will have the same size + * as this, otherwise the new matrix will have the specified size. + * + */ + virtual MatrixPtr clone(size_t height = 0, + size_t width = 0, + bool useGpu = false) { + LOG(FATAL) << "Not implemented"; + return nullptr; + } + + virtual real* getRowBuf(size_t row) { + LOG(FATAL) << "Not implemented"; + return nullptr; + } + + virtual real getElement(size_t x, size_t y) const { + LOG(FATAL) << "Not implemented"; + return 0; + } + + virtual real getSum() { + LOG(FATAL) << "Not implemented"; + return 0; + } + + virtual void accumulateColSum(Matrix& src) { + LOG(FATAL) << "Not implemented"; + } + + virtual real getAbsSum() { + LOG(FATAL) << "Not implemented"; + return 0; + } + + /** + * @note Original data may not be preserved after resize().
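 + * + * A short sketch of the implication (illustrative only; the matrix and + * sizes are assumed): + * @code + * MatrixPtr m = Matrix::create(4, 4); + * m->resize(8, 8); // contents of the old 4x4 buffer may be lost here + * m->zeroMem(); // so reinitialize before reading + * @endcode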
+ */ + virtual void resize(size_t newHeight, size_t newWidth) = 0; + + /** + * @note This should only be used for sparse matrix. + */ + virtual void resize(size_t newHeight, + size_t newWidth, + size_t newNnz, /* total item used to allocate space */ + SparseValueType valueType, + SparseFormat format) = 0; + + /** + * @brief This should only be used for sparse matrix. + * + * Currently must be called for each row in order. + * The matrix is not valid until setRow is called for the last row. + */ + virtual void setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) = 0; + + virtual MatrixPtr getTranspose() = 0; + + /** + * @brief hard transpose. + * + * If matTrans' memory is allocated outside, set memAlloc to false; + * otherwise set it to true. + */ + virtual void transpose(MatrixPtr& matTrans, bool memAlloc) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief rotate 90 degrees clockwise if clockWise=true; + * otherwise rotate counter-clockwise + * clock-wise: + * \f[ + * y(j,i) = x(M-i-1,j) + * \f] + * anti clock-wise: + * \f[ + * y(j,i) = x(i, N-1-j) + * \f] + * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output. + * + * If matRot's memory is allocated outside, set memAlloc to false; + * otherwise set it to true. + */ + virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { + LOG(FATAL) << "Not implemented"; + } + + virtual MatrixPtr getInverse() { + LOG(FATAL) << "Not implemented"; + return nullptr; + } + + /** + * @brief inverse. + * + * If matInv's memory is allocated outside, set memAlloc to false; + * otherwise set it to true. + */ + virtual void inverse(MatrixPtr& matInv, bool memAlloc) { + LOG(FATAL) << "Not implemented"; + } + + public: + /// Only set all variables to 0 or NULL but not free them. + virtual void clear() { + height_ = 0; + width_ = 0; + data_ = NULL; + } + + void reshape(size_t height, size_t width); + + /// add b to each sample of this. + virtual void addBias(Matrix& b, real scale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void addSharedBias(Matrix& b, real scale) { + LOG(FATAL) << "Not implemented"; + } + + void addBias(Matrix& b, real scale, bool sharedBias) { + if (!sharedBias) { + addBias(b, scale); + } else { + addSharedBias(b, scale); + } + } + + /// add each sample from a to this. + virtual void collectBias(Matrix& a, real scale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void collectSharedBias(Matrix& a, real scale) { + LOG(FATAL) << "Not implemented"; + } + + void collectBias(Matrix& a, real scale, bool sharedBias) { + if (!sharedBias) { + collectBias(a, scale); + } else { + collectSharedBias(a, scale); + } + } + + virtual void sequenceAvgForward(Matrix& a, + const IVector& startsPos, + int mode) { + LOG(FATAL) << "Not implemented"; + } + + virtual void sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = scaleAB*(a*b) + scaleT*this + * @endcode + */ + virtual void mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { + LOG(FATAL) << "Not implemented"; + } + + /// Add a vector (column) b to matrix a, column by column.
+ virtual void addColumnVector(const Matrix& b) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * For j < codeLength: + * this(i, j) += vec(index(i, j), 0) + * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1 + * @endcode + */ + virtual void addByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& vec) { + (void)numClasses; + (void)codes; + (void)vec; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * For j < codeLength: + * vec(index(i, j), 0) += this(i, j) + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void addByBitCodeBackward(size_t numClasses, + const IVector& codes, + Matrix& vec) { + (void)numClasses; + (void)codes; + (void)vec; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * For j < codeLength: + * this(i, j) += <mat.row(index(i, j)), input.row(i)> + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& mat, + const Matrix& input) { + (void)numClasses; + (void)codes; + (void)mat; + (void)input; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * For j < codeLength: + * mat.row(index(i, j)) += this(i, j) * input.row(i) + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void mulByBitCodeBackwardWeight(size_t numClasses, + const IVector& codes, + Matrix& mat, + const Matrix& input) { + (void)numClasses; + (void)codes; + (void)mat; + (void)input; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * For j < codeLength: + * input.row(i) += this(i, j) * mat.row(index(i, j)) + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void mulByBitCodeBackwardError(size_t numClasses, + const IVector& codes, + const Matrix& mat, + Matrix& input) { + (void)numClasses; + (void)codes; + (void)mat; + (void)input; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * For j < codeLength + * sum(i, 0) = scaleSum * \sum_j bit(i, j) * this(i, j) + * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0 + * @endcode + */ + virtual void sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, + real scaleSum) { + (void)numClasses; + (void)codes; + (void)sum; + (void)scaleSum; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * For j < codeLength + * this(i, j) -= bit(i, j) + * where bit(i, j) is same as that for sumByBitCode + * @endcode + */ + virtual void subByBitCode(size_t numClasses_, IVector& codes) { + (void)numClasses_; + (void)codes; + LOG(FATAL) << "Not implemented"; + } + + /** + * add the sum of each row of this to mat + */ + virtual void rowSum(Matrix& sum) { + (void)sum; + LOG(FATAL) << "Not implemented"; + } + + /** + * set the max of each row of this to mat + */ + virtual void rowMax(Matrix& max) { + (void)max; + LOG(FATAL) << "Not implemented"; + } + + /** + * set the max of each column of this to mat + */ + virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; } + + /** + * @brief Get the top k elements of each column of this matrix. + * + * The row ids and values of these elements are stored in + * maxIds and max respectively, where k is the size of maxIds. + * And note that the top k elements are not sorted.
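 + * + * A usage sketch (the beam size, shapes, and the IVector factory from + * Vector.h are illustrative assumptions): + * @code + * size_t beam = 3; // top-3 rows per column + * MatrixPtr m = Matrix::create(100, 10); + * IVectorPtr ids = IVector::create(beam * 10, false); + * MatrixPtr val = Matrix::create(beam, 10); + * m->colMax(*ids, *val); // unsorted top-3 ids/values per column + * @endcode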
+ */ + virtual void colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutForward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutBackward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + + virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } + + /** + * @brief Get the top k elements of each row of this matrix. + * + * The column ids and values of these elements are stored in + * maxIds and max respectively, where k is the size of maxIds. + * And note that the top k elements are not sorted. + */ + virtual void rowMax(IVector& maxIds, Matrix& max) { + LOG(FATAL) << "Not implemented"; + } + + /// normalize each row so that the sum of each row is 1. + virtual void rowNormalizeL1(Matrix& out) { + (void)out; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = a*b + * @endcode + */ + virtual void mul(const Matrix& a, const Matrix& b) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = scaleAB*(this*b) + scaleT*this + * @endcode + */ + virtual void rightMul(Matrix& b, real scaleAB, real scaleT) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = this * b + * @endcode + */ + virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; } + + /** + * @code + * this = scaleAB*(a*this) + scaleT*this + * @endcode + */ + virtual void leftMul(Matrix& a, real scaleAB, real scaleT) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = a*this + * @endcode + */ + virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; } + + /// merge the element for each col. + virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; } + + /// copy -log(output[label]) to this->data[i]. + virtual void oneHotCrossEntropy(Matrix& output, IVector& label) { + LOG(FATAL) << "Not implemented"; + } + + /// calculate the error of outputV according to label. + virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { + LOG(FATAL) << "Not implemented"; + } + + /// copy -log(output[label]) to this->data[i]. + virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha) { + LOG(FATAL) << "Not implemented"; + } + + /// calculate the error of outputV according to label. + virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha) { + LOG(FATAL) << "Not implemented"; + } + + /** + * \f[ + * a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j} + * \f] + * + * b contains M elements, + * c contains N elements (N is odd), + * b's index arithmetic is computed modulo M, + * c's index arithmetic is computed modulo N.
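 + * + * Restated as a plain per-row loop (a reference sketch that mirrors the + * CpuMatrix implementation earlier in this diff): + * @code + * int half = (N - 1) / 2; + * for (int i = 0; i < M; ++i) + * for (int j = 0; j < N; ++j) + * a[i] += b[(i + j - half + M) % M] * c[j]; + * @endcode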
+ */ + virtual void circularConv(Matrix& b, Matrix& c) { + LOG(FATAL) << "Not implemented"; + } + + virtual void circularConvDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2) { + LOG(FATAL) << "Not implemented"; + } + + /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */ + virtual void softmax(Matrix& output) { + (void)output; + LOG(FATAL) << "Not implemented"; + } + virtual void sequenceSoftmax(Matrix& output, const IVector& index) { + (void)output; + LOG(FATAL) << "Not implemented"; + } + + virtual void softmaxBackward(Matrix& outputV) { + (void)outputV; + LOG(FATAL) << "Not implemented"; + } + + /* + sum_i = sum_j this_ij * output_ij + this_ij = output_ij * (this_ij - sum_i) + */ + virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { + LOG(FATAL) << "Not implemented"; + } + + /// calculate the sum of squares diff cost. + virtual void sumOfSquares(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + /// gradient of sumOfSquares. + virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + virtual void smoothL1(Matrix& output, Matrix& label, real destScale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void smoothL1Bp(Matrix& outputV, Matrix& label, real destScale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; } + + virtual void tanhDerivative(Matrix& output) { + LOG(FATAL) << "Not implemented"; + } + + virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; } + + virtual void softreluDerivative(Matrix& output) { + LOG(FATAL) << "Not implemented"; + } + + virtual void scaledTanh(Matrix& output, real p1, real p2) { + LOG(FATAL) << "Not implemented"; + } + + /// print out the values of elements to os + virtual void print(std::ostream& os) const { + LOG(FATAL) << "Not implemented"; + } + + /** + * print a part of the matrix + * from the (top,left) value to the (height, width) value (not included) + */ + virtual void print(std::ostream& os, size_t height, size_t width) const { + LOG(FATAL) << "Not implemented"; + } + + /// print one row to os + virtual void printOneRow(std::ostream& os, size_t idx) const { + LOG(FATAL) << "Not implemented"; + } + + virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {} + + virtual real getMin() { + LOG(FATAL) << "Not implemented"; + return 0; + } + virtual real getMax() { + LOG(FATAL) << "Not implemented"; + return 0; + } + + virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; } + + /** + * @brief calculate the error of classification + * + * output[i] = 1 if row i is an error. + * + * output[i] = 0 if row i is correct. + * + */ + virtual void classificationError(Matrix& output, + IVector& label, + size_t topkSize = 1) { + LOG(FATAL) << "Not implemented"; + } + + virtual void upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + LOG(FATAL) << "Not implemented"; + } + + virtual void upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + LOG(FATAL) << "Not implemented"; + } + + /** + * Pooling forward operation, pick out the largest element + * in the sizeX of value, if the maskMatP is not NULL, it will + * also calculate the location indices.
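 + * + * The expected output size follows the usual stride/padding arithmetic (a + * sketch mirroring the vol2Col computation elsewhere in this diff): + * @code + * outputH = (imgSizeH + 2 * paddingH - sizeY) / strideH + 1; + * outputW = (imgSizeW + 2 * paddingW - sizeX) / strideW + 1; + * @endcode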
+ */ + virtual void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + MatrixPtr maskMatP = NULL) { + LOG(FATAL) << "Not implemented"; + } + + /// Pooling backward operation. + virtual void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemented"; + } + + /// Pooling forward operation, calculate the average of sizeX elements. + virtual void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + bool excludeMode = true) { + LOG(FATAL) << "Not implemented"; + } + + virtual void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW, + bool excludeMode = true) { + LOG(FATAL) << "Not implemented"; + } + + /** + * Pooling 3D forward operation, pick out the largest element + * in the sizeX of value + */ + virtual void maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemented"; + } + + virtual void maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + LOG(FATAL) << "Not implemented"; + } + + virtual void avgPool3DForward(Matrix& input, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemented"; + } + + virtual void avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + LOG(FATAL) << "Not implemented"; + } + + /** + * Input: one or more sequences. Each sequence contains some instances. + * + * Output: output size is the number of input sequences (NOT input + * instances). + * + * output[i] is set to max_input[i].
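 + * + * For example (values are illustrative): with sequence start positions + * [0, 3, 5] there are two sequences over five input rows, so: + * @code + * // output.row(0)[d] = max over input rows 0..2 of dimension d + * // output.row(1)[d] = max over input rows 3..4 of dimension d + * // index records, per output element, the winning input row + * @endcode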
+ */ + virtual void maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index) { + LOG(FATAL) << "Not implemented"; + } + + virtual void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this.row[i] += table.row[ids[i]] + * if ids[i] == -1, it will be ignored + * @endcode + */ + virtual void selectRows(Matrix& table, IVector& ids) { + (void)table; + (void)ids; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this[i] = table[i, id[i]] + * @endcode + */ + virtual void selectElements(Matrix& table, IVector& ids) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * table.row[ids[i]] += this.row[i] + * if ids[i] == -1, it will be ignored + * @endcode + */ + virtual void addToRows(Matrix& table, IVector& ids) { + (void)table; + (void)ids; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * table[i, id[i]] += this[i] + * @endcode + */ + virtual void addElements(Matrix& table, IVector& ids) { + LOG(FATAL) << "Not implemented"; + } + /** + * @brief cross entropy for multi binary labels + * + * @code + * this[i] = -sum(label[i][j]*log(output[i][j]) + * + (1-label[i][j])*log(1-output[i][j])) + * @endcode + */ + virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief The gradient of cross entropy for multi binary labels on output + * + * @code + * this[i][j] = -label[i][j]/output[i][j] + * + (1-label[i][j])/(1-output[i][j]) + * @endcode + */ + virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief Calculate the classification error for multi binary labels + * + * @code + * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0) + * || (output[i][j] < threshold && label[i][j] == 1)) + * / output->getWidth() + * @endcode + */ + virtual void classificationErrorMulti(Matrix& output, + Matrix& label, + real threshold) { + LOG(FATAL) << "Not implemented"; + } + + virtual void paramReluForward(Matrix& data, Matrix& W) { + LOG(FATAL) << "Not implemented"; + } + virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) { + LOG(FATAL) << "Not implemented"; + } + virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { + LOG(FATAL) << "Not implemented"; + } + + virtual void vol2Col(real* data, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW) { + LOG(FATAL) << "Not implemented"; + } + + virtual void col2Vol(real* trg, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real alpha, + real beta) { + LOG(FATAL) << "Not implemented"; + } + + virtual void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + LOG(FATAL) << "Not implemented"; + } + virtual void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + LOG(FATAL) << "Not implemented"; + } + + template <typename ExpressionType> + void operator=(const ExpressionType& expr) { + if
(useGpu_) { + TensorGpuApply(*this, expr); + } else { + TensorCpuApply(*this, expr); + } + } + + bool isEmpty() const { return data_ == nullptr; } + + explicit operator bool() const { return !isEmpty(); } +}; + +inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { + mat.print(os); + return os; +} + +class GpuMatrix : public Matrix { + public: + GpuMatrix(); + + GpuMatrix(size_t height, size_t width, bool trans = false); + GpuMatrix(real* data, size_t height, size_t width, bool trans = false) + : Matrix(data, height, width, trans, true) {} + GpuMatrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false) + : Matrix(data, height, width, stride, trans, true) {} + GpuMatrix(GpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : Matrix(dataHandle, height, width, trans, true) {} + ~GpuMatrix(); + + void zeroMem(); + void resetOne(); + void setDiag(real value); + + void resize(size_t newHeight, size_t newWidth); + void resize(size_t newHeight, + size_t newWidth, + size_t newNnz, /* used to allocate space */ + SparseValueType valueType, + SparseFormat format) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + + /** + * Copy the data from cpu_memory buffer + */ + void copyFrom(const real* hostSrc, size_t size); + + void copyFrom(const real* hostSrc, const int64_t* seq); + + void copyFrom(const Matrix& src, hl_stream_t stream); + + void copyFrom(const Matrix& src); + + void copyFrom(const IVector& src); + + void copyByRowIndex(Matrix& b, const IVector& rowIndex); + + MatrixPtr clone(size_t height, size_t width, bool useGpu = false); + + real getElement(size_t x, size_t y) const; + + real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } + virtual real* getRowBuf(size_t row) { return getRow(row); } + + real getSum(); + void accumulateColSum(Matrix& src); + real getAbsSum(); + + real getMin(); + real getMax(); + + MatrixPtr getTranspose(); + void transpose(MatrixPtr& matTrans, bool memAlloc); + void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise); + + MatrixPtr getInverse(); + void inverse(MatrixPtr& matInv, bool memAlloc); + + /// add b to each sample of this. + void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); + + /** + * @code + * add each sample from a to this. 
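 + * // i.e. a sketch, assuming this is the 1 x width bias row: + * // this(0, j) += scale * sum_i a(i, j)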
+ * @endcode + */ + void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); + + void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); + void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); + + /** + * @code + * this.row[i] += table.row[ids[i]] + * @endcode + */ + virtual void selectRows(Matrix& table, IVector& ids); + + /** + * @code + * this[i] = table[i, id[i]] + * @endcode + */ + virtual void selectElements(Matrix& table, IVector& ids); + + /** + * @code + * table.row[ids[i]] += this.row[i] + * @endcode + */ + virtual void addToRows(Matrix& table, IVector& ids); + + void addColumnVector(const Matrix& b); + + /** + * @code + * this = scaleAB*(a*b) + scaleT*this + * @endcode + */ + void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); + + /** + * @code + * this = a*b + * @endcode + */ + void mul(const Matrix& a, const Matrix& b); + + void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); + + void mul(const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT); + + void mul(const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, + real scaleT); + + /** + * @code + * this = scaleAB*(this*b) + scaleT*this + * @endcode + */ + void rightMul(Matrix& b, real scaleAB, real scaleT); + + /** + * @code + * this = this* b + * @endcode + */ + void rightMul(Matrix& b); + + /** + * @code + * this = scaleAB*(a*this) + scaleT*this + * @endcode + */ + void leftMul(Matrix& a, real scaleAB, real scaleT); + + /** + * @code + * this = a*this + * @endcode + */ + void leftMul(Matrix& a); + + void colMerge(Matrix& src); + void rowSum(Matrix& sum); + void rowMax(Matrix& max); + void rowMax(IVector& maxIds, Matrix& max); + void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& max); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); + + void oneHotCrossEntropy(Matrix& output, IVector& label); + void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); + void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha); + void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha); + + void softmax(Matrix& output); + void sequenceSoftmax(Matrix& output, const IVector& index); + void softmaxBackward(Matrix& outputV); + void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); + + /// calculate the sum of squares diff cost. + void sumOfSquares(Matrix& output, Matrix& label); + + /// gradient of sumOfSquares. 
+ void sumOfSquaresBp(Matrix& outputV, Matrix& label); + void tanh(Matrix& output); + void tanhDerivative(Matrix& output); + void softrelu(Matrix& output); + void softreluDerivative(Matrix& output); + void scaledTanh(Matrix& output, real p1, real p2); + + virtual void print(std::ostream& os) const; + virtual void print(std::ostream& os, size_t height, size_t width) const; + + void paramReluForward(Matrix& data, Matrix& W); + void paramReluBackwardW(Matrix& oGrad, Matrix& data); + void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); + + void check(std::ostream& os, Matrix& refMat, bool printDiff = true); + void randomizeUniform(); + + void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); + + void upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + + void upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + + void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + MatrixPtr maskMatP); + + void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + bool excludeMode = true); + + void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW, + bool excludeMode = true); + + void maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput); + + void avgPool3DForward(Matrix& input, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real 
scaleTargets, + real scaleOutput); + + void maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index); + + void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index); + + void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void vol2Col(real* data, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW); + + void col2Vol(real* trg, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real alpha, + real beta); + + void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); + + void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); + + template <typename ExpressionType> + void operator=(const ExpressionType& expr) { + TensorGpuApply(*this, expr); + } +}; + +class CpuMatrix : public Matrix { + private: + MatrixPtr sftmaxSum_; + MatrixPtr sftmaxDot_; + + public: + CpuMatrix(size_t height, size_t width, bool trans = false); + CpuMatrix(real* data, size_t height, size_t width, bool trans = false) + : Matrix(data, height, width, trans, false) {} + CpuMatrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false) + : Matrix(data, height, width, stride, trans, false) {} + + CpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : Matrix(dataHandle, height, width, trans, false) {} + + ~CpuMatrix(); + + void zeroMem(); + void resetOne(); + void setDiag(real value); + + void resize(size_t newHeight, size_t newWidth); + void resize(size_t newHeight, + size_t newWidth, + size_t newNnz, /* used to allocate space */ + SparseValueType valueType, + SparseFormat format) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + + real getElement(size_t x, size_t y) const; + real getSum(); + void accumulateColSum(Matrix& src); + real getAbsSum(); + + MatrixPtr getTranspose(); + void transpose(MatrixPtr& matTrans, bool memAlloc); + void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise); + + MatrixPtr getInverse(); + void inverse(MatrixPtr& matInv, bool memAlloc); + + void copyFrom(const Matrix& src); + + void copyFrom(const Matrix& src, hl_stream_t stream); + + void copyFrom(const real* cpuSrc, size_t size); + + void copyFrom(const real* cpuSrc, const int64_t* seq); + + void copyFrom(const IVector& src); + + void copyFrom(CpuSparseMatrix& src); + + void copyByRowIndex(Matrix& b, const IVector& rowIndex); + + MatrixPtr clone(size_t height, size_t width, bool useGpu = false); + + void upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + + void upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + + void maxPoolForward(Matrix& inputMat, +
size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + MatrixPtr maskMatP); + + void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + bool excludeMode = true); + + void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW, + bool excludeMode = true); + + void maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput); + + void avgPool3DForward(Matrix& input, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput); + + void maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index); + + void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index); + + real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } + virtual real* getRowBuf(size_t row) { return getRow(row); } + + public: + /// add b to each sample of this. + void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); + + /// add each sample of a to this. 
+ void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); + + void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); + void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); + + /** + * @code + * this.row[i] += table.row[ids[i]] + * @endcode + */ + virtual void selectRows(Matrix& table, IVector& ids); + + /** + * @code + * table.row[ids[i]] += this.row[i] + * @endcode + */ + virtual void addToRows(Matrix& table, IVector& ids); + + /** + * @code + * this[i] = table[i, id[i]] + * @endcode + */ + virtual void selectElements(Matrix& table, IVector& ids); + + /** + * @code + * table[i, id[i]] += this[i] + * @endcode + */ + virtual void addElements(Matrix& table, IVector& ids); + + /** + * use abstract getRow() to get row from table. + * + * Define table as template instead of virtual class for performance's sake. + * Internally used by the above two virtual funcs. + */ + template <typename TableMatType> + void selectRowsImp(TableMatType& table, IVector& ids); + template <typename TableMatType> + void addToRowsImp(TableMatType& table, IVector& ids); + + void addColumnVector(const Matrix& b); + + void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); + void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); + + void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT); + + static void mul(CpuMatrix* a, + CpuMatrix* b, + CpuSparseMatrix* c, + real scaleAB, + real scaleT); + + /** + * c = a * b + * + * use abstract getRow() to get row from B,C. + * Define B,C as template instead of virtual class for performance's sake. + */ + template <typename MatBType, typename MatCType> + static void mul( + CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT); + + virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); + + void mul(const Matrix& a, const Matrix& b); + + void rightMul(Matrix& b, real scaleAB, real scaleT); + void rightMul(Matrix& b); + + void leftMul(Matrix& a, real scaleAB, real scaleT); + void leftMul(Matrix& a); + void colMerge(Matrix& src); + void rowSum(Matrix& sum); + void rowMaxId(IVector& maxIds); + void rowMax(Matrix& max); + void rowMax(IVector& maxIds, Matrix& maxVal); + void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& maxVal); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); + void rowNormalizeL1(Matrix& out); + + void oneHotCrossEntropy(Matrix& output, IVector& label); + void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); + void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha); + void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha); + + void circularConv(Matrix& b, Matrix& c); + void circularConvDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2); + + void softmax(Matrix& output); + void sequenceSoftmax(Matrix& output, const IVector& index); + void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); + + /// calculate the sum of squares diff cost. + void sumOfSquares(Matrix& output, Matrix& label); + + /// gradient of sumOfSquares.
+ void sumOfSquaresBp(Matrix& outputV, Matrix& label); + + void smoothL1(Matrix& output, Matrix& label, real destScale); + void smoothL1Bp(Matrix& output, Matrix& label, real destScale); + + void tanh(Matrix& output); + void tanhDerivative(Matrix& output); + + void softrelu(Matrix& output); + void softreluDerivative(Matrix& output); + void scaledTanh(Matrix& output, real p1, real p2); + + void print(std::ostream& os) const; + void print(std::ostream& os, size_t height, size_t width) const; + void printOneRow(std::ostream& os, size_t idx) const; + + void paramReluForward(Matrix& data, Matrix& W); + void paramReluBackwardW(Matrix& oGrad, Matrix& data); + void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); + + void check(std::ostream& os, Matrix& refMat, bool printDiff = true); + + real getMin(); + real getMax(); + + void randomizeUniform(); + + void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); + + void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec); + + void addByBitCodeBackward(size_t numClasses, + const IVector& codes, + Matrix& vec); + + void mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& mat, + const Matrix& input); + + void mulByBitCodeBackwardWeight(size_t numClasses, + const IVector& codes, + Matrix& mat, + const Matrix& input); + + void mulByBitCodeBackwardError(size_t numClasses, + const IVector& codes, + const Matrix& mat, + Matrix& input); + + void sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, + real scaleSum); + + void subByBitCode(size_t numClasses_, IVector& codes); + + void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); + void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); + void classificationErrorMulti(Matrix& output, Matrix& label, real threshold); + + void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void vol2Col(real* data, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW); + + void col2Vol(real* trg, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real alpha, + real beta); + + template <typename ExpressionType> + void operator=(const ExpressionType& expr) { + TensorCpuApply(*this, expr); + } +}; + +class SharedCpuMatrix : public CpuMatrix { + public: +#ifndef PADDLE_MOBILE_INFERENCE + /* blockNum is number of partitions of the matrix */ + SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false) + : CpuMatrix(height, width, trans) { + initShared(blockNum); + } + SharedCpuMatrix( + int blockNum, real* data, size_t height, size_t width, bool trans = false) + : CpuMatrix(data, height, width, trans) { + initShared(blockNum); + } + + SharedCpuMatrix(int blockNum, + CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : CpuMatrix(dataHandle, height, width, trans) { + initShared(blockNum); + } + + SharedCpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, +
+
+class SharedCpuMatrix : public CpuMatrix {
+ public:
+#ifndef PADDLE_MOBILE_INFERENCE
+  /* blockNum is number of partitions of the matrix */
+  SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
+      : CpuMatrix(height, width, trans) {
+    initShared(blockNum);
+  }
+  SharedCpuMatrix(
+      int blockNum, real* data, size_t height, size_t width, bool trans = false)
+      : CpuMatrix(data, height, width, trans) {
+    initShared(blockNum);
+  }
+
+  SharedCpuMatrix(int blockNum,
+                  CpuMemHandlePtr dataHandle,
+                  size_t height,
+                  size_t width,
+                  bool trans = false)
+      : CpuMatrix(dataHandle, height, width, trans) {
+    initShared(blockNum);
+  }
+
+  SharedCpuMatrix(CpuMemHandlePtr dataHandle,
+                  size_t height,
+                  size_t width,
+                  bool trans = false)
+      : CpuMatrix(dataHandle, height, width, trans) {
+    initBlock(1);
+  }
+
+  ~SharedCpuMatrix() {}
+
+ public:
+  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
+  virtual void add(Matrix& b, real p1, real p2);
+  virtual void add(real p1, real p2);
+
+ private:
+  using Matrix::mul;
+  void initShared(int blockNum);
+  void initBlock(int blockNum);
+
+  int blockNum_;
+  std::vector<std::unique_ptr<std::mutex>> blockLocks_;
+  ThreadLocal<CpuMatrixPtr> localBuf_;
+  ThreadLocal<std::vector<int>> localBufRows_;
+  ThreadLocal<std::vector<int>> blockSeq_;
+#endif
+};
+
+typedef struct { unsigned int col; } sparse_non_value_t;
+
+typedef struct {
+  unsigned int col;
+  float value;
+} sparse_float_value_t;
+
+}  // namespace paddle
+#include "ExecViaCpu.h"
diff --git a/paddle/math/MatrixBitCode.cpp b/paddle/legacy/math/MatrixBitCode.cpp
similarity index 100%
rename from paddle/math/MatrixBitCode.cpp
rename to paddle/legacy/math/MatrixBitCode.cpp
diff --git a/paddle/math/MemoryHandle.cpp b/paddle/legacy/math/MemoryHandle.cpp
similarity index 100%
rename from paddle/math/MemoryHandle.cpp
rename to paddle/legacy/math/MemoryHandle.cpp
diff --git a/paddle/math/MemoryHandle.h b/paddle/legacy/math/MemoryHandle.h
similarity index 100%
rename from paddle/math/MemoryHandle.h
rename to paddle/legacy/math/MemoryHandle.h
diff --git a/paddle/math/NEONFunctions.cpp b/paddle/legacy/math/NEONFunctions.cpp
similarity index 100%
rename from paddle/math/NEONFunctions.cpp
rename to paddle/legacy/math/NEONFunctions.cpp
diff --git a/paddle/math/NEONFunctions.h b/paddle/legacy/math/NEONFunctions.h
similarity index 100%
rename from paddle/math/NEONFunctions.h
rename to paddle/legacy/math/NEONFunctions.h
diff --git a/paddle/math/PoolAllocator.cpp b/paddle/legacy/math/PoolAllocator.cpp
similarity index 100%
rename from paddle/math/PoolAllocator.cpp
rename to paddle/legacy/math/PoolAllocator.cpp
diff --git a/paddle/math/PoolAllocator.h b/paddle/legacy/math/PoolAllocator.h
similarity index 100%
rename from paddle/math/PoolAllocator.h
rename to paddle/legacy/math/PoolAllocator.h
diff --git a/paddle/math/RowBuffer.h b/paddle/legacy/math/RowBuffer.h
similarity index 100%
rename from paddle/math/RowBuffer.h
rename to paddle/legacy/math/RowBuffer.h
diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/legacy/math/SIMDFunctions.cpp
similarity index 100%
rename from paddle/math/SIMDFunctions.cpp
rename to paddle/legacy/math/SIMDFunctions.cpp
diff --git a/paddle/math/SIMDFunctions.h b/paddle/legacy/math/SIMDFunctions.h
similarity index 100%
rename from paddle/math/SIMDFunctions.h
rename to paddle/legacy/math/SIMDFunctions.h
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/legacy/math/SparseMatrix.cpp
similarity index 100%
rename from paddle/math/SparseMatrix.cpp
rename to paddle/legacy/math/SparseMatrix.cpp
diff --git a/paddle/math/SparseMatrix.h b/paddle/legacy/math/SparseMatrix.h
similarity index 100%
rename from paddle/math/SparseMatrix.h
rename to paddle/legacy/math/SparseMatrix.h
diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/legacy/math/SparseRowMatrix.cpp
similarity index 100%
rename from paddle/math/SparseRowMatrix.cpp
rename to paddle/legacy/math/SparseRowMatrix.cpp
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/legacy/math/SparseRowMatrix.h
similarity index 100%
rename from paddle/math/SparseRowMatrix.h
rename to paddle/legacy/math/SparseRowMatrix.h
diff --git a/paddle/math/Storage.cpp b/paddle/legacy/math/Storage.cpp
similarity index 100%
rename from paddle/math/Storage.cpp
rename to paddle/legacy/math/Storage.cpp
diff --git
a/paddle/math/Storage.h b/paddle/legacy/math/Storage.h similarity index 100% rename from paddle/math/Storage.h rename to paddle/legacy/math/Storage.h diff --git a/paddle/math/TensorApply.h b/paddle/legacy/math/TensorApply.h similarity index 100% rename from paddle/math/TensorApply.h rename to paddle/legacy/math/TensorApply.h diff --git a/paddle/math/TensorAssign.h b/paddle/legacy/math/TensorAssign.h similarity index 100% rename from paddle/math/TensorAssign.h rename to paddle/legacy/math/TensorAssign.h diff --git a/paddle/math/TensorEvaluate.h b/paddle/legacy/math/TensorEvaluate.h similarity index 100% rename from paddle/math/TensorEvaluate.h rename to paddle/legacy/math/TensorEvaluate.h diff --git a/paddle/math/TensorExpression.h b/paddle/legacy/math/TensorExpression.h similarity index 100% rename from paddle/math/TensorExpression.h rename to paddle/legacy/math/TensorExpression.h diff --git a/paddle/math/TrainingAlgorithmOp.cu b/paddle/legacy/math/TrainingAlgorithmOp.cu similarity index 100% rename from paddle/math/TrainingAlgorithmOp.cu rename to paddle/legacy/math/TrainingAlgorithmOp.cu diff --git a/paddle/math/TrainingAlgorithmOp.h b/paddle/legacy/math/TrainingAlgorithmOp.h similarity index 100% rename from paddle/math/TrainingAlgorithmOp.h rename to paddle/legacy/math/TrainingAlgorithmOp.h diff --git a/paddle/math/Vector.cpp b/paddle/legacy/math/Vector.cpp similarity index 100% rename from paddle/math/Vector.cpp rename to paddle/legacy/math/Vector.cpp diff --git a/paddle/math/Vector.h b/paddle/legacy/math/Vector.h similarity index 100% rename from paddle/math/Vector.h rename to paddle/legacy/math/Vector.h diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/legacy/math/tests/CMakeLists.txt similarity index 100% rename from paddle/math/tests/CMakeLists.txt rename to paddle/legacy/math/tests/CMakeLists.txt diff --git a/paddle/legacy/math/tests/OriginalOptimizerApi.h b/paddle/legacy/math/tests/OriginalOptimizerApi.h new file mode 100644 index 0000000000000000000000000000000000000000..1f942e28f47832a25d5aa00f80f83eb5a6f5210f --- /dev/null +++ b/paddle/legacy/math/tests/OriginalOptimizerApi.h @@ -0,0 +1,201 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/utils/GlobalConstants.h"
+
+using namespace paddle;  // NOLINT
+
+void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
+                                      real alpha,
+                                      real beta,
+                                      real gamma,
+                                      real tau,
+                                      real learningRate) {
+  vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
+                                   -alpha * gamma * learningRate);
+  vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
+                                   tau * alpha * gamma * learningRate);
+  vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
+                             tau / beta + 1.0 / alpha,
+                             *vecs[PARAMETER_MOMENTUM_VT],
+                             1.0 / beta);
+}
+
+void AdagradParameterOptimizer(const VectorPtr vecs[],
+                               real epsilon,
+                               real learningRate,
+                               real momentum,
+                               real decayRate) {
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
+                                                1.0f);
+  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
+                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
+  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
+                                real rou,
+                                real epsilon,
+                                real learningRate,
+                                real momentum,
+                                real decayRate) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
+
+  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
+  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
+                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
+                                        epsilon,
+                                        epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->sqrt2();
+
+  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
+      *vecs[PARAMETER_GRADIENT],
+      *vecs[PARAMETER_LEARNING_RATE],
+      rou,
+      1.0f - rou);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void RMSPropParameterOptimizer(const VectorPtr vecs[],
+                               real accumulatedRou,
+                               real rou,
+                               real epsilon,
+                               real learningRate,
+                               real momentum,
+                               real decayRate,
+                               bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first update, make the sum the current square
+  // so that the initial estimate of E(g_t^2) is not too small.
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
+
+  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
+
+  // learn_rate = 1 / sqrt( E(g_t^2) - (E(g_t))^2 + epsilon )
+  // Basically, if the sign of the gradient changes more often,
+  // the learning rate will be decreased.
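+  // (Editorial note, not in the original file.) The two running averages
+  // above form a variance estimate, Var(g) ~= E(g^2) - E(g)^2, so the
+  // per-element scale computed next is 1 / sqrt(Var(g) + epsilon):
+  // parameters whose gradients oscillate take smaller effective steps.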
+  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
+  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
+                                           -1.0f);
+  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
+                                      real accumulatedRou,
+                                      real rou,
+                                      real epsilon,
+                                      real learningRate,
+                                      real momentum,
+                                      real decayRate,
+                                      bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first update, make the sum the current square
+  // so that the initial estimate of E(g_t^2) is not too small.
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
+
+  // learn_rate = 1 / sqrt( E(g_t^2) + epsilon )
+  // Basically, the bigger the gradient's magnitude,
+  // the smaller the learning rate will be.
+  vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void AdamParameterOptimizer(const VectorPtr vecs[],
+                            real beta1,
+                            real beta2,
+                            real beta1_power,
+                            real beta2_power,
+                            real epsilon,
+                            real learningRate) {
+  Vector* m = vecs[PARAMETER_MOMENTUM].get();
+  Vector* g = vecs[PARAMETER_GRADIENT].get();
+  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
+  Vector* theta = vecs[PARAMETER_VALUE].get();
+
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1) * g_t
+  m->add(*g, beta1, 1 - beta1);
+
+  // v_t = \beta_2 * v_{t-1} + (1-\beta_2) * g_t^2
+  g->square2();
+  v->add(*g, beta2, 1 - beta2);
+
+  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
+  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
+  g->sqrt2(*v);
+  g->dotDiv(*m, *g, 0., epsilon);
+  real alpha =
+      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+  theta->add(*theta, 1.0, *g, -alpha);
+}
+
+void AdamaxParameterOptimizer(
+    const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
+  Vector* m = vecs[PARAMETER_MOMENTUM].get();
+  Vector* g = vecs[PARAMETER_GRADIENT].get();
+  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
+  Vector* theta = vecs[PARAMETER_VALUE].get();
+
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1) * g_t
+  m->add(*g, beta1, 1 - beta1);
+
+  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
+  u->mulScalar(beta2);
+  g->abs2();
+  u->max2(*u, *g);
+
+  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
+  g->dotDiv(*m, *u);
+  real learningRate = alpha / (1 - std::pow(beta1, step));
+  theta->add(*theta, 1.0, *g, -learningRate);
+}
diff --git a/paddle/math/tests/PerfUtils.h b/paddle/legacy/math/tests/PerfUtils.h
similarity index 100%
rename from paddle/math/tests/PerfUtils.h
rename to paddle/legacy/math/tests/PerfUtils.h
diff --git a/paddle/legacy/math/tests/TensorCheck.h b/paddle/legacy/math/tests/TensorCheck.h
new file mode 100644
index 0000000000000000000000000000000000000000..41c8ece282e05f55d063e6ad0d8805629c847d34
--- /dev/null
+++ b/paddle/legacy/math/tests/TensorCheck.h
@@ -0,0 +1,216 @@
+/* Copyright (c) 2016 PaddlePaddle Authors.
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +/** + * This file provides a TensorCheck template function, which can be used to + * compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on. + */ + +#include +#include "paddle/legacy/math/Matrix.h" + +namespace autotest { + +using paddle::Matrix; +using paddle::CpuMatrix; +using paddle::GpuMatrix; +using paddle::VectorT; +using paddle::CpuVectorT; +using paddle::GpuVectorT; + +class AssertEqual { + public: + AssertEqual(real err = 0) : err_(err) {} + + inline bool operator()(real a, real b) { + if (err_ == 0) { + if (a != b) { + return false; + } + } else { + if (std::fabs(a - b) > err_) { + if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) { + return false; + } + } + } + + return true; + } + + private: + real err_; +}; + +template +class CopyToCpu; + +template <> +class CopyToCpu { + public: + explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {} + const CpuMatrix& copiedArg() const { return arg_; } + + private: + const CpuMatrix& arg_; +}; + +template <> +class CopyToCpu { + public: + explicit CopyToCpu(const GpuMatrix& arg) + : arg_(arg.getHeight(), arg.getWidth()) { + arg_.copyFrom(arg); + } + CpuMatrix& copiedArg() { return arg_; } + + private: + CpuMatrix arg_; +}; + +template <> +class CopyToCpu { + public: + explicit CopyToCpu(const Matrix& arg) + : arg_(arg.getHeight(), arg.getWidth()) { + arg_.copyFrom(arg); + } + CpuMatrix& copiedArg() { return arg_; } + + private: + CpuMatrix arg_; +}; + +template +class CopyToCpu> { + public: + explicit CopyToCpu(const CpuVectorT& arg) : arg_(arg) {} + const CpuVectorT& copiedArg() const { return arg_; } + + private: + const CpuVectorT& arg_; +}; + +template +class CopyToCpu> { + public: + explicit CopyToCpu(const GpuVectorT& arg) : arg_(arg.getSize()) { + arg_.copyFrom(arg); + } + CpuVectorT& copiedArg() { return arg_; } + + private: + CpuVectorT arg_; +}; + +template +class CopyToCpu> { + public: + explicit CopyToCpu(const VectorT& arg) : arg_(arg.getSize()) { + arg_.copyFrom(arg); + } + CpuVectorT& copiedArg() { return arg_; } + + private: + CpuVectorT arg_; +}; + +template +void TensorCheck(AssertEq compare, + const CpuMatrix& matrix1, + const CpuMatrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + real a = data1[i * width + j]; + real b = data2[i * width + j]; + if (!compare(a, b)) { + count++; + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +template +void TensorCheck(AssertEq compare, + const CpuVectorT& vector1, + const CpuVectorT& vector2) { + CHECK(vector1.getSize() == vector2.getSize()); + + const T* data1 = vector1.getData(); + const T* data2 = vector2.getData(); + size_t size = 
vector1.getSize();
+  int count = 0;
+  for (size_t i = 0; i < size; i++) {
+    real a = data1[i];
+    real b = data2[i];
+    if (!compare(a, b)) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+}
+
+template <typename AssertEq, typename Tensor1, typename Tensor2>
+void TensorCheck(AssertEq compare,
+                 const Tensor1& tensor1,
+                 const Tensor2& tensor2) {
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
+template <typename AssertEq>
+void TensorCheck(AssertEq compare, real args1, real args2) {
+  EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1
+                                         << ", args2 = " << args2;
+}
+
+template <typename AssertEq>
+void TensorCheck(AssertEq compare, size_t args1, size_t args2) {
+  EXPECT_EQ(args1, args2) << "[Test error] args1 = " << args1
+                          << ", args2 = " << args2;
+}
+
+template <typename Tensor1, typename Tensor2>
+void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) {
+  AssertEqual compare(0);
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
+template <typename Tensor1, typename Tensor2>
+void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) {
+#ifndef PADDLE_TYPE_DOUBLE
+  AssertEqual compare(1e-3);
+#else
+  AssertEqual compare(1e-10);
+#endif
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
+}  // namespace autotest
diff --git a/paddle/legacy/math/tests/TestUtils.h b/paddle/legacy/math/tests/TestUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..60e76359da61ac32346b093d9a9ff69104bfc494
--- /dev/null
+++ b/paddle/legacy/math/tests/TestUtils.h
@@ -0,0 +1,294 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+/**
+ * This file provides an AutoCompare class to simplify the comparison
+ * of CPU and GPU member functions.
+ *
+ * It takes two steps:
+ * 1. Construct an AutoCompare object.
+ *    When constructing an AutoCompare object, you can set the err argument
+ *    to specify the maximum error for CPU and GPU functions.
+ *
+ * 2. Use the template functions cmpWithArg or cmpWithoutArg.
+ * A. [cmpWithArg] Requires the caller to construct the CPU arguments.
+ *
+ *    AutoCompare test;
+ *    Init Argument arg1, arg2...
+ *    test.cmpWithArg(function, arg1, arg2...)
+ *
+ * B. [cmpWithoutArg] The caller does not need to construct arguments,
+ *    provided every matrix in the function's argument list has the same
+ *    size, such as the element-wise and aggregate functions
+ *    defined in BaseMatrix.cpp.
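+ *
+ *    A concrete call, mirroring test_BaseMatrix.cpp later in this diff
+ *    (editorial example, not in the original header):
+ *      AutoCompare test(height, width, 1e-5);
+ *      test.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width);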
+ * + * AutoCompare test; + * test.cmpWithoutArg(function, height, width) + */ + +#include +#include "TensorCheck.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" + +namespace autotest { + +using paddle::BaseMatrix; +using paddle::CpuMatrix; +using paddle::GpuMatrix; +using paddle::CpuIVector; +using paddle::GpuIVector; +using paddle::CpuSparseMatrix; +using paddle::GpuSparseMatrix; + +template +class ReplaceType { + public: + typedef T1 type; +}; + +template <> +class ReplaceType { + public: + typedef CpuMatrix type; +}; + +template <> +class ReplaceType { + public: + typedef GpuMatrix type; +}; + +template <> +class ReplaceType { + public: + typedef CpuMatrix type; +}; + +template <> +class ReplaceType { + public: + typedef GpuMatrix type; +}; + +// construct a argument +template +T construct(int height, int width); + +template <> +float construct(int height, int width) { + return 0.5; +} + +template <> +double construct(int height, int width) { + return 0.5; +} + +template <> +size_t construct(int height, int width) { + size_t offset = std::rand() % (height < width ? height : width); + return offset; +} + +template <> +CpuMatrix construct(int height, int width) { + CpuMatrix a(height, width); + return a; +} + +template <> +GpuMatrix construct(int height, int width) { + GpuMatrix a(height, width); + return a; +} + +// init a argument +template +void init(T& v) { + return; +} + +template <> +void init(CpuMatrix& v) { + v.randomizeUniform(); +} + +template <> +void init(GpuMatrix& v) { + v.randomizeUniform(); +} + +// init a tuple which contains a set of arguments. +template +inline typename std::enable_if::type initTuple( + std::tuple& t) {} + +template + inline typename std::enable_if < + I::type initTuple(std::tuple& t) { + init(std::get(t)); + initTuple(t); +} + +// copy a argument, copy src to dest +template +void copy(T1& dest, T2& src) { + dest = src; +} + +template <> +void copy(GpuMatrix& dest, CpuMatrix& src) { + dest.copyFrom(src); +} + +// copy a tuple, copy src to dest +template +inline typename std::enable_if::type copyTuple( + std::tuple& dest, std::tuple& src) {} + +template + inline typename std::enable_if < + I::type copyTuple(std::tuple& dest, + std::tuple& src) { + copy(std::get(dest), std::get(src)); + copyTuple(dest, src); +} + +// call member function +template +R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) { + return (obj.*f)(args...); +} + +template +class ReturnType { + public: + typedef T type; +}; + +template <> +class ReturnType { + public: + typedef GpuMatrix type; +}; + +template <> +class ReturnType { + public: + typedef GpuIVector type; +}; + +template <> +class ReturnType { + public: + typedef GpuSparseMatrix type; +}; + +template +typename ReturnType::type autoArgs(T& v) { + return v; +} + +template <> +GpuMatrix autoArgs(CpuMatrix& v) { + GpuMatrix a(v.getHeight(), v.getWidth()); + a.copyFrom(v); + return a; +} + +template <> +GpuIVector autoArgs(CpuIVector& v) { + GpuIVector a(v.getSize()); + a.copyFrom(v); + return a; +} + +template <> +GpuSparseMatrix autoArgs(CpuSparseMatrix& v) { + GpuSparseMatrix a(v.getHeight(), + v.getWidth(), + v.getElementCnt(), + v.getValueType(), + v.getFormat()); + a.copyFrom(v, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + return a; +} + +class AutoCompare { + public: + /** + * err is the allowed calculation error. + * The smaller the value of err, + * the stricter the comparison is between CPU and GPU calculations. 
+ */ + AutoCompare(size_t height, size_t width, real err = 1e-3) + : cpu(height, width), gpu(height, width), compare(err) { + init(cpu); + copy(gpu, cpu); + } + + template + void cmpWithArg(R (C::*f)(FArgs...), Args&&... args) { + static_assert(sizeof...(FArgs) == sizeof...(Args), + "size of parameter packs are not equal"); + call(cpu, f, args...); + call(gpu, f, autoArgs(args)...); + + TensorCheck(compare, cpu, gpu); + } + + template + void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) { + static_assert(sizeof...(I) == sizeof...(Args), + "size of parameter packs are not equal"); + (void)height; + (void)width; + auto tuple1 = std::make_tuple( + construct>::type>::type, + CpuMatrix>::type>(height, width)...); + + auto tuple2 = std::make_tuple( + construct>::type>::type, + GpuMatrix>::type>(height, width)...); + + initTuple(tuple1); + copyTuple(tuple2, tuple1); + + call(cpu, f, std::get(tuple1)...); + call(gpu, f, std::get(tuple2)...); + + TensorCheck(compare, cpu, gpu); + } + + protected: + CpuMatrix cpu; + GpuMatrix gpu; + AssertEqual compare; +}; + +} // namespace autotest diff --git a/paddle/legacy/math/tests/test_Allocator.cpp b/paddle/legacy/math/tests/test_Allocator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..710b55f57e5468bf8ccd3ceb49821f5832cffb90 --- /dev/null +++ b/paddle/legacy/math/tests/test_Allocator.cpp @@ -0,0 +1,122 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/utils/Logging.h" +#include "paddle/utils/Util.h" +#define private public +#include "paddle/legacy/math/Allocator.h" +#include "paddle/legacy/math/MemoryHandle.h" +#include "paddle/legacy/math/PoolAllocator.h" + +using namespace paddle; // NOLINT + +template +void testPoolAllocator() { + PoolAllocator* pool = + new PoolAllocator(new Allocator(), /* sizeLimit */ 1024); + + /* alloc from system memory */ + void* ptr1 = pool->alloc(10); + void* ptr2 = pool->alloc(200); + void* ptr3 = pool->alloc(200); + pool->free(ptr1, 10); + pool->free(ptr2, 200); + pool->free(ptr3, 200); + pool->printAll(); + EXPECT_EQ((size_t)2, pool->pool_.size()); + EXPECT_EQ((size_t)1, pool->pool_[10].size()); + EXPECT_EQ((size_t)2, pool->pool_[200].size()); + EXPECT_EQ(ptr1, pool->pool_[10][0]); + EXPECT_EQ(ptr2, pool->pool_[200][0]); + EXPECT_EQ(ptr3, pool->pool_[200][1]); + + /* alloc from pool */ + void* ptr4 = pool->alloc(10); + void* ptr5 = pool->alloc(200); + pool->printAll(); + EXPECT_EQ((size_t)0, pool->pool_[10].size()); + EXPECT_EQ((size_t)1, pool->pool_[200].size()); + EXPECT_EQ(ptr1, ptr4); + EXPECT_EQ(ptr3, ptr5); + pool->free(ptr4, 10); + pool->free(ptr5, 200); + + /* alloc size > sizeLimit */ + void* ptr6 = pool->alloc(1024); + pool->free(ptr6, 1024); + EXPECT_LE((size_t)1024, pool->poolMemorySize_); + + void* ptr7 = pool->alloc(1); + EXPECT_EQ((size_t)0, pool->poolMemorySize_); + EXPECT_EQ((size_t)0, pool->pool_.size()); + pool->free(ptr7, 1); + + delete pool; +} + +TEST(Allocator, Pool) { + testPoolAllocator(); +#ifdef PADDLE_WITH_CUDA + testPoolAllocator(); +#endif +} + +TEST(MemoryHandle, Cpu) { + for (auto size : {10, 30, 50, 100, 200, 512, 1000, 1023, 1024, 1025, 8193}) { + CpuMemoryHandle handle(size); + EXPECT_LE(handle.getSize(), handle.getAllocSize()); + } + + void* ptr1; + void* ptr2; + { + CpuMemoryHandle handle(256); + ptr1 = handle.getBuf(); + } + { + CpuMemoryHandle handle(256); + ptr2 = handle.getBuf(); + } + EXPECT_EQ(ptr1, ptr2); +} + +#ifdef PADDLE_WITH_CUDA +TEST(MemoryHandle, Gpu) { + int numGpu = hl_get_device_count(); + + /* alloc from system memory */ + void* ptr3[numGpu]; + void* ptr4[numGpu]; + for (int i = 0; i < numGpu; i++) { + SetDevice device(i); + GpuMemoryHandle handle1(30); + GpuMemoryHandle handle2(30); + GpuMemoryHandle handle3(4000); + GpuMemoryHandle handle4(500); + ptr3[i] = handle3.getBuf(); + ptr4[i] = handle4.getBuf(); + } + + /* alloc from pool */ + for (int i = 0; i < numGpu; i++) { + SetDevice device(i); + GpuMemoryHandle handle1(30); + GpuMemoryHandle handle3(4000); + GpuMemoryHandle handle4(500); + EXPECT_EQ(ptr3[i], handle3.getBuf()); + EXPECT_EQ(ptr4[i], handle4.getBuf()); + } +} +#endif diff --git a/paddle/legacy/math/tests/test_BaseMatrix.cpp b/paddle/legacy/math/tests/test_BaseMatrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..488765c6ac203ad064146faaab7b8c423d53cf0b --- /dev/null +++ b/paddle/legacy/math/tests/test_BaseMatrix.cpp @@ -0,0 +1,247 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+/**
+ * This test file uses autotest::AutoCompare and cmpWithoutArg to compare
+ * the CPU and GPU implementations of the member functions in
+ * BaseMatrix.cpp and Matrix.cpp.
+ */
+
+#include <gtest/gtest.h>
+#include "TestUtils.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+
+using paddle::BaseMatrix;
+using paddle::Matrix;
+using autotest::AutoCompare;
+
+// Test all void (BaseMatrix::*)() functions
+TEST(BaseMatrix, void) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)()) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg(f, height, width);
+      };
+
+      compare(&BaseMatrix::neg);
+      compare(&BaseMatrix::exp2);
+      compare(&BaseMatrix::log2);
+      compare(&BaseMatrix::sqrt2);
+      compare(&BaseMatrix::square2);
+      compare(&BaseMatrix::reciprocal2);
+      compare(&BaseMatrix::abs2);
+      compare(&BaseMatrix::sign2);
+      compare(&BaseMatrix::zero);
+      compare(&BaseMatrix::one);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(real) functions
+TEST(BaseMatrix, real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(real)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0>(f, height, width);
+      };
+
+      compare(&BaseMatrix::pow2);
+      compare(&BaseMatrix::subScalar);
+      compare(&BaseMatrix::mulScalar);
+      compare(&BaseMatrix::divScalar);
+      compare(&BaseMatrix::assign);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::biggerThanScalar);
+      compare(&BaseMatrix::downClip);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(BaseMatrix&) functions
+TEST(BaseMatrix, BaseMatrix) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0>(f, height, width);
+      };
+
+      compare(&BaseMatrix::assign);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::relu);
+      compare(&BaseMatrix::reluDerivative);
+      compare(&BaseMatrix::softrelu);
+      compare(&BaseMatrix::softreluDerivative);
+      compare(&BaseMatrix::brelu);
+      compare(&BaseMatrix::breluDerivative);
+      compare(&BaseMatrix::square2);
+      compare(&BaseMatrix::squareDerivative);
+      compare(&BaseMatrix::tanh);
+      compare(&BaseMatrix::tanhDerivative);
+      compare(&BaseMatrix::reciprocal2);
+      compare(&BaseMatrix::reciprocalDerivative);
+      compare(&BaseMatrix::abs2);
+      compare(&BaseMatrix::absDerivative);
+      compare(&BaseMatrix::sigmoid);
+      compare(&BaseMatrix::sigmoidDerivative);
+      compare(&BaseMatrix::expDerivative);
+      compare(&BaseMatrix::sign2);
+      compare(&BaseMatrix::exp2);
+      compare(&BaseMatrix::log2);
+      compare(&BaseMatrix::sqrt2);
+      compare(&BaseMatrix::dotMul);
+      compare(&BaseMatrix::dotMulSquare);
+      compare(&BaseMatrix::dotSquareMul);
+      compare(&BaseMatrix::addColVector);
+      compare(&BaseMatrix::addRowVector);
+      compare(&BaseMatrix::mulRowVector);
+      compare(&BaseMatrix::divRowVector);
+      compare(&BaseMatrix::mulColVector);
+      compare(&BaseMatrix::divColVector);
+      compare(&BaseMatrix::addP2P);
+      compare(&BaseMatrix::invSqrt);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(real, real) functions
+TEST(BaseMatrix, real_real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare =
[height, width](void (BaseMatrix::*f)(real, real)) { + AutoCompare test(height, width, 1e-5); + test.cmpWithoutArg<0, 1>(f, height, width); + }; + + compare(&BaseMatrix::add); + compare(&BaseMatrix::clip); + } + } +} + +// Test all void (BaseMatrix::*)(BaseMatrix&, real) function +TEST(BaseMatrix, BaseMatrix_real) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) { + AutoCompare test(height, width, 1e-5); + test.cmpWithoutArg<0, 1>(f, height, width); + }; + + compare(&BaseMatrix::addBias); + compare(&BaseMatrix::add); + compare(&BaseMatrix::sub); + compare(&BaseMatrix::pow2); + compare(&BaseMatrix::addScalar); + compare(&BaseMatrix::subScalar); + compare(&BaseMatrix::mulScalar); + compare(&BaseMatrix::divScalar); + compare(&BaseMatrix::scalarDiv); + compare(&BaseMatrix::addSquare); + compare(&BaseMatrix::isEqualTo); + } + } +} + +// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) function +TEST(BaseMatrix, BaseMatrix_BaseMatrix) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + auto compare = [height, + width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) { + AutoCompare test(height, width, 1e-5); + test.cmpWithoutArg<0, 1>(f, height, width); + }; + + compare(&BaseMatrix::softCrossEntropy); + compare(&BaseMatrix::softCrossEntropyBp); + compare(&BaseMatrix::binaryLabelCrossEntropy); + compare(&BaseMatrix::binaryLabelCrossEntropyBp); + compare(&BaseMatrix::sub); + compare(&BaseMatrix::add2); + compare(&BaseMatrix::dotMul); + compare(&BaseMatrix::dotDiv); + compare(&BaseMatrix::logisticRegressionLoss); + compare(&BaseMatrix::logisticRegressionLossBp); + compare(&BaseMatrix::biggerThan); + compare(&BaseMatrix::max2); + compare(&BaseMatrix::dotMulSquare); + compare(&BaseMatrix::dotSquareSquare); + } + } +} + +void TestEelementWise(size_t height, size_t width) { + AutoCompare rowScale(height, width); + rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width); + + AutoCompare rowDotMul(height, width); + rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width); + + AutoCompare binaryClassificationError(height, width); + binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>( + &BaseMatrix::binaryClassificationError, height, width); + + AutoCompare sumOfSquaresBp(height, width); + sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width); +} + +void TestAggregateToRow(size_t height, size_t width) { + AutoCompare maxCols(1, width); + maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width); + + AutoCompare minCols(1, width); + minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width); + + AutoCompare addDotMulVMM(1, width); + addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width); + + AutoCompare sumCols(1, width); + sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width); + + AutoCompare collectBias(1, width); + collectBias.cmpWithoutArg<0, 1>( + static_cast(&Matrix::collectBias), + height, + width); +} + +void TestAggregateToCol(size_t height, size_t width) { + AutoCompare maxRows(height, 1); + maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width); + + AutoCompare minRows(height, 1); + minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width); + + AutoCompare sumRows(height, 1); + sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width); + + AutoCompare sumOfSquares(height, 1); + 
sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width); +} + +TEST(BaseMatrix, Other) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + TestEelementWise(height, width); + TestAggregateToRow(height, width); + TestAggregateToCol(height, width); + } + } +} + +#endif diff --git a/paddle/legacy/math/tests/test_CpuGpuVector.cpp b/paddle/legacy/math/tests/test_CpuGpuVector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..38071582001d6b4914b6600d12b3fd951e1023de --- /dev/null +++ b/paddle/legacy/math/tests/test_CpuGpuVector.cpp @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA + +#include +#include "paddle/legacy/math/Vector.h" +#include "paddle/utils/Util.h" +#include "test_matrixUtil.h" + +using namespace paddle; // NOLINT + +TEST(CpuGpuVector, getData) { + size_t size = 500; + hl_stream_t stream(HPPL_STREAM_DEFAULT); + CpuVectorPtr cpuVec = std::make_shared(size); + GpuVectorPtr gpuVec = std::make_shared(size); + cpuVec->uniform(0.0, 10.0); + gpuVec->copyFrom(*cpuVec, stream); + hl_stream_synchronize(stream); + + CpuGpuVectorPtr vec = std::make_shared(gpuVec); + auto a = vec->getData(false); + auto b = cpuVec->getData(); + hl_stream_synchronize(stream); + checkDataEqual(a, b, size); +} + +TEST(CpuGpuVector, subCreate) { + size_t size1 = 1024; + size_t offset = 100; + size_t size2 = 500; + hl_stream_t stream(HPPL_STREAM_DEFAULT); + CpuGpuVectorPtr v1 = std::make_shared(size1, /*useGpu*/ false); + auto vec = v1->getMutableVector(false); + vec->uniform(0.0, 10.0); + auto v2 = std::make_shared(*v1, offset, size2); + CHECK_EQ(*v1->getSync(), *v2->getSync()); + + // check subVec equal + checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2); + + CpuVectorPtr v1Check = std::make_shared(size1); + CpuVectorPtr v2Check = std::make_shared(size2); + v1Check->copyFrom(*(v1->getVector(true)), stream); + v2Check->copyFrom(*(v2->getVector(true)), stream); + hl_stream_synchronize(stream); + + checkDataEqual(v2->getData(false), v2Check->getData(), size2); + checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2); + + CpuVectorPtr noise = std::make_shared(size2); + noise->uniform(0.0, 1.0); + auto v = v2->getMutableVector(false); // will change header + // add noise to subVec + v->add(*noise); + + // check v1_cpu_data == v2_cpu_data + checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2); + + v1Check->copyFrom(*(v1->getVector(true)), stream); + v2Check->copyFrom(*(v2->getVector(true)), stream); + hl_stream_synchronize(stream); + + // check v1_gpu_data == v2_gpu_data + checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2); +} + +#endif diff --git a/paddle/legacy/math/tests/test_ExecViaCpu.cpp b/paddle/legacy/math/tests/test_ExecViaCpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..55a3f5f50545d76bf7d62ed6b5a4b9fb8a590f45 --- /dev/null 
+++ b/paddle/legacy/math/tests/test_ExecViaCpu.cpp @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/legacy/math/SparseMatrix.h" + +using namespace paddle; // NOLINT + +const int height = 10; +const int width = 16; + +real f(Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, + real scalar) { + CHECK(!mat1.useGpu()); + CHECK(!mat2.useGpu()); + CHECK(!vec1.useGpu()); + CHECK(!vec2.useGpu()); + mat1.copyFrom(mat2); + vec1.copyFrom(vec2); + + return scalar; +} + +class Functor { + public: + real operator()(Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, + real scalar) { + a_ = f(mat1, mat2, vec1, vec2, scalar); + return a_; + } + + private: + real a_; +}; + +template +void testWrapper(F&& f) { + MatrixPtr cpumat1 = Matrix::create(height, width, false, /*useGpu=*/false); + MatrixPtr cpumat2 = Matrix::create(height, width, false, /*useGpu=*/false); + + IVectorPtr cpuvec1 = IVector::create(height, /*useGpu=*/false); + IVectorPtr cpuvec2 = IVector::create(height, /*useGpu=*/false); + + const real scalar = 1.23456; + + MatrixPtr gpumat1 = Matrix::create(height, width, false, /*useGpu=*/true); + MatrixPtr gpumat2 = Matrix::create(height, width, false, /*useGpu=*/true); + IVectorPtr gpuvec1 = IVector::create(height, /*useGpu=*/true); + IVectorPtr gpuvec2 = IVector::create(height, /*useGpu=*/true); + + cpumat2->randomizeUniform(); + cpuvec2->rand(width); + gpumat2->copyFrom(*cpumat2); + gpuvec2->copyFrom(*cpuvec2); + + real ret = execViaCpu(f, *gpumat1, *gpumat2, *gpuvec1, *gpuvec2, 1.23456); + EXPECT_EQ(ret, scalar); + cpumat1->copyFrom(*gpumat1); + cpuvec1->copyFrom(*gpuvec1); + + for (int i = 0; i < height; ++i) { + EXPECT_EQ(cpuvec1->getElement(i), cpuvec2->getElement(i)); + for (int j = 0; j < width; ++j) { + EXPECT_EQ(cpumat1->getElement(i, j), cpumat2->getElement(i, j)); + } + } + gpumat1->resize(height, 1); + execViaCpu2(&CpuMatrix::selectElements, *gpumat1, *gpumat2, *gpuvec1); + + cpumat1->resize(height, 1); + cpumat1->selectElements(*cpumat2, *cpuvec1); + for (int i = 0; i < height; ++i) { + EXPECT_EQ(cpumat1->getElement(i, 0), gpumat1->getElement(i, 0)); + } +} + +#ifdef PADDLE_WITH_CUDA +TEST(ExecViaCpu, test1) { + testWrapper(f); + testWrapper(&f); + + auto lambda = [](Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, + real scalar) -> real { + return f(mat1, mat2, vec1, vec2, scalar); + }; + LOG(INFO) << "lambda is_class=" << std::is_class::value + << " is_function=" << std::is_function::value; + testWrapper(lambda); + + Functor functor; + testWrapper(functor); +} +#endif diff --git a/paddle/legacy/math/tests/test_FPException.cpp b/paddle/legacy/math/tests/test_FPException.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fd17f29695886044ab65c6ce78da1fc64ec0607 --- /dev/null +++ b/paddle/legacy/math/tests/test_FPException.cpp @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 
PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/**
+ * This test is about floating-point calculation exceptions.
+ * Paddle catches FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW exceptions.
+ *
+ * Some exceptions occur in the middle of a set of formulas
+ * and can be circumvented by some tricks.
+ * For example, when calculating tanh
+ *   b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+ *
+ * If the result of (-2 * a) is too large,
+ * an FE_OVERFLOW exception occurs when calculating exp.
+ * But tanh itself has no overflow problem,
+ * so we can add some tricks to prevent exp from calculating an excessive
+ * value.
+ */
+
+#include <gtest/gtest.h>
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Common.h"
+
+using namespace paddle;  // NOLINT
+
+void SetTensorValue(Matrix& matrix, real value) {
+  int height = matrix.getHeight();
+  int width = matrix.getWidth();
+  int stride = matrix.getStride();
+  real* data = matrix.getData();
+  for (int i = 0; i < height; i++) {
+    int j = rand() % width;  // NOLINT
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      data[i * stride + j] = value;
+    } else if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
+    } else {
+      LOG(FATAL) << "should not reach here";
+    }
+  }
+}
+
+template <typename Matrix>
+void testTanh(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->tanh(*B);
+}
+
+template <typename Matrix>
+void testSigmoid(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->sigmoid(*B);
+}
+
+TEST(fp, overflow) {
+  for (auto illegal : {-90.0, 90.0}) {
+    LOG(INFO) << " illegal=" << illegal;
+    testTanh<CpuMatrix>(illegal);
+    testSigmoid<CpuMatrix>(illegal);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/math/tests/test_GpuProfiler.cpp b/paddle/legacy/math/tests/test_GpuProfiler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..450c9a035e369cb2d1ce5ba4e89d8e142d9af016
--- /dev/null
+++ b/paddle/legacy/math/tests/test_GpuProfiler.cpp
@@ -0,0 +1,165 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#ifdef PADDLE_WITH_CUDA + +#include +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/testing/TestUtil.h" +#include "paddle/utils/Stat.h" +#include "paddle/utils/Util.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + real a = data1[i * width + j]; + real b = data2[i * width + j]; + if (fabs(a - b) > err) { + if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) { + count++; + } + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void testBilinearFwdBwd(int numSamples, + int imgSizeH, + int imgSizeW, + int channels) { + int inWidth = imgSizeH * imgSizeW * channels; + int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels; + real ratioH = 0.5; + real ratioW = 0.5; + + // forward + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + + input->randomizeUniform(); + inputGpu->copyFrom(*input); + + { + // nvprof: GPU Proflier + REGISTER_GPU_PROFILER("testBilinearFwdBwd"); + target->bilinearForward(*input, + imgSizeH, + imgSizeW, + 2 * imgSizeH, + 2 * imgSizeW, + channels, + ratioH, + ratioW); + targetGpu->bilinearForward(*inputGpu, + imgSizeH, + imgSizeW, + 2 * imgSizeH, + 2 * imgSizeW, + channels, + ratioH, + ratioW); + } + + // check + targetCheck->copyFrom(*targetGpu); + MatrixCheckErr(*target, *targetCheck); + + // backward + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr targetCheckGrad = + CpuMatrix::create(numSamples, inWidth, false, false); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->bilinearBackward(*targetGrad, + 2 * imgSizeH, + 2 * imgSizeW, + imgSizeH, + imgSizeW, + channels, + ratioH, + ratioW); + inputGpuGrad->bilinearBackward(*targetGpuGrad, + 2 * imgSizeH, + 2 * imgSizeW, + imgSizeH, + imgSizeW, + channels, + ratioH, + ratioW); + + // check + targetCheckGrad->copyFrom(*inputGpuGrad); + MatrixCheckErr(*inputGrad, *targetCheckGrad); +} + +TEST(Profiler, testBilinearFwdBwd) { + auto numSamples = 10; + auto channels = 16; + auto imgSize = 64; + { + // nvprof: GPU Proflier + REGISTER_GPU_PROFILER("testBilinearFwdBwd"); + // Paddle built-in timer + REGISTER_TIMER_INFO( + "testBilinearFwdBwd", + "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"); + testBilinearFwdBwd(numSamples, imgSize, imgSize, channels); + } + globalStat.printAllStatus(); +} 
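+
+// (Editorial sketch, not part of the original test.) The macros above are
+// RAII scopes: timing starts at the macro and stops at the enclosing brace,
+// so wrapping a region is all that is needed for it to show up both in
+// nvprof and in globalStat's report. `runWorkload` is a hypothetical
+// stand-in for any GPU workload.
+template <typename Workload>
+void profileWorkloadSketch(Workload&& runWorkload) {
+  REGISTER_GPU_PROFILER("workloadSketch");
+  REGISTER_TIMER_INFO("workloadSketch", "hypothetical workload");
+  runWorkload();
+}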
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  // nvprof: GPU Profiler
+  REGISTER_GPU_PROFILER(
+      "RecursiveProfilingTest",
+      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+
+  return RUN_ALL_TESTS();
+}
+
+#endif
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/legacy/math/tests/test_Matrix.cpp
similarity index 100%
rename from paddle/math/tests/test_Matrix.cpp
rename to paddle/legacy/math/tests/test_Matrix.cpp
diff --git a/paddle/legacy/math/tests/test_RowBuffer.cpp b/paddle/legacy/math/tests/test_RowBuffer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2ef8cd303d65f50cd18adb7f80fa18a665b67340
--- /dev/null
+++ b/paddle/legacy/math/tests/test_RowBuffer.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/math/SIMDFunctions.h" +#include "paddle/utils/Util.h" + +#include + +#include +#include +#include +#include + +#include +#include + +static constexpr size_t VECTOR_LEN = 3072; +static constexpr size_t BATCH_SIZE = 64; +static constexpr size_t ALIGN = 32; +static_assert(VECTOR_LEN % ALIGN == 0, "VECTOR_LEN % ALIGN == 0"); +static_assert(BATCH_SIZE % ALIGN == 0, "BATCH_SIZE % ALIGN == 0"); +static constexpr float EPSILON = 1e-5; +static std::mt19937 RandomEngine(time(0)); + +inline static std::unique_ptr NewVector(size_t len = VECTOR_LEN, + size_t align = ALIGN) { + float* ptr; + CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0); + return std::unique_ptr(ptr); +} + +inline static std::unique_ptr NewRandomVector(size_t len = VECTOR_LEN, + size_t align = ALIGN) { + std::uniform_real_distribution dist(-100.0f, 100.0f); + auto generator = std::bind(dist, RandomEngine); + auto retv = NewVector(len, align); + std::generate_n(retv.get(), len, generator); + return retv; +} + +TEST(SIMDFunction, addTo) { + typedef std::function AddToMethodType; + + AddToMethodType naive = paddle::simd::naive::addTo; + AddToMethodType simd = paddle::simd::addTo; + + auto A = NewRandomVector(); + auto B = NewRandomVector(); + + auto ACopy = NewVector(); + memcpy(ACopy.get(), A.get(), VECTOR_LEN * sizeof(float)); + + naive(A.get(), B.get(), VECTOR_LEN); + simd(ACopy.get(), B.get(), VECTOR_LEN); + + for (size_t i = 0; i < VECTOR_LEN; ++i) { + ASSERT_NEAR(A[i], ACopy[i], EPSILON); + } +} + +TEST(SIMDFunction, batchAddTo) { + auto A = NewRandomVector(); + auto ACopy = NewVector(); + memcpy(ACopy.get(), A.get(), sizeof(float) * VECTOR_LEN); + + std::vector> B; + for (size_t i = 0; i < BATCH_SIZE; ++i) { + B.emplace_back(NewRandomVector()); + } + std::unique_ptr BRaw(new float*[BATCH_SIZE]); + for (size_t i = 0; i < BATCH_SIZE; ++i) { + BRaw[i] = B[i].get(); + } + + typedef std::function + BatchAddToMethodType; + + BatchAddToMethodType naive = paddle::simd::naive::batchAddTo; + BatchAddToMethodType simd = paddle::simd::batchAddTo; + + naive(A.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN); + simd(ACopy.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN); + + for (size_t i = 0; i < VECTOR_LEN; ++i) { + ASSERT_NEAR(A[i], ACopy[i], EPSILON); + } +} + +TEST(SIMDFunction, colMax) { + auto A = NewRandomVector(VECTOR_LEN * BATCH_SIZE); + auto naiveResult = NewVector(BATCH_SIZE); + auto simdResult = NewVector(BATCH_SIZE); + + typedef std::function ColMaxMethodType; + ColMaxMethodType naive = paddle::simd::naive::colMax; + ColMaxMethodType simd = paddle::simd::colMax; + + naive(naiveResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN); + simd(simdResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN); + + for (size_t i = 0; i < BATCH_SIZE; ++i) { + ASSERT_NEAR(naiveResult[i], simdResult[i], EPSILON); + } +} + +TEST(SIMDFunction, decayL1_WithLR) { + auto dest = NewRandomVector(); + auto src = NewRandomVector(); + auto lr = NewRandomVector(); + auto lambda = 0.23f; + + auto simd_dest = NewVector(); + memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN); + + typedef std::function 
+ DecayL1MethodType; + + DecayL1MethodType naive = []( + float* d, float* s, float* lr, float l, size_t len) { + paddle::simd::naive::decayL1(d, s, lr, l, len); + }; + + DecayL1MethodType simd = []( + float* d, float* s, float* lr, float l, size_t len) { + paddle::simd::decayL1(d, s, lr, l, len); + }; + + naive(dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN); + simd(simd_dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN); + + for (size_t i = 0; i < VECTOR_LEN; ++i) { + ASSERT_NEAR(dest[i], simd_dest[i], EPSILON); + } +} + +TEST(SIMDFunction, decayL1_WithoutLR) { + auto dest = NewRandomVector(); + auto src = NewRandomVector(); + auto lambda = 0.23; + + auto simd_dest = NewVector(); + memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN); + + typedef std::function DecayL1MethodType; + + DecayL1MethodType naive = [](float* d, float* s, float l, size_t len) { + paddle::simd::naive::decayL1(d, s, l, len); + }; + + DecayL1MethodType simd = [](float* d, float* s, float l, size_t len) { + paddle::simd::decayL1(d, s, l, len); + }; + + naive(dest.get(), src.get(), lambda, VECTOR_LEN); + simd(simd_dest.get(), src.get(), lambda, VECTOR_LEN); + + for (size_t i = 0; i < VECTOR_LEN; ++i) { + ASSERT_NEAR(dest[i], simd_dest[i], EPSILON); + } +} diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/legacy/math/tests/test_SparseMatrix.cpp similarity index 100% rename from paddle/math/tests/test_SparseMatrix.cpp rename to paddle/legacy/math/tests/test_SparseMatrix.cpp diff --git a/paddle/legacy/math/tests/test_Tensor.cu b/paddle/legacy/math/tests/test_Tensor.cu new file mode 100644 index 0000000000000000000000000000000000000000..3ce056d66140059be8145f7f49bb80cbff4686eb --- /dev/null +++ b/paddle/legacy/math/tests/test_Tensor.cu @@ -0,0 +1,1162 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+#define INIT_UNARY(A1, A2)  \
+  Tensor A1(height, width); \
+  Tensor A2(height, width); \
+  A1.randomizeUniform();    \
+  A2.copyFrom(A1)
+#define INIT_BINARY(A1, A2, B) \
+  INIT_UNARY(A1, A2);          \
+  Tensor B(height, width);     \
+  B.randomizeUniform()
+#define INIT_TERNARY(A1, A2, B, C) \
+  INIT_BINARY(A1, A2, B);          \
+  Tensor C(height, width);         \
+  C.randomizeUniform()
+#define INIT_QUATERNARY(A1, A2, B, C, D) \
+  INIT_TERNARY(A1, A2, B, C);            \
+  Tensor D(height, width);               \
+  D.randomizeUniform()
+
+template <typename Tensor>
+struct TestUnaryMatrix {
+  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
+
+  explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_UNARY(A1, A2);
+        testUnaryFunc(A1, A2);
+      }
+    }
+  }
+};
+
+template <typename Tensor>
+struct TestBinaryMatrix {
+  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
+
+  explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_BINARY(A1, A2, B);
+        testBinaryFunc(A1, A2, B);
+      }
+    }
+  }
+};
+
+template <typename Tensor>
+struct TestTernaryMatrix {
+  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)>
+      TernaryFunc;
+
+  explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_TERNARY(A1, A2, B, C);
+        testTernaryFunc(A1, A2, B, C);
+      }
+    }
+  }
+};
+
+template <typename Tensor>
+struct TestQuaternaryMatrix {
+  typedef std::function<void(
+      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)>
+      QuaternaryFunc;
+
+  explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_QUATERNARY(A1, A2, B, C, D);
+        testQuaternaryFunc(A1, A2, B, C, D);
+      }
+    }
+  }
+};
+
+template <typename Tensor, typename T>
+struct TestUnaryVectorT {
+  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
+
+  explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) {
+    for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) {
+      LOG(INFO) << " size=" << size;
+      Tensor A1(size);
+      Tensor A2(size);
+      if (typeid(T) == typeid(real)) {
+        A1.rand();
+      } else {
+        A1.rand(1000);
+      }
+      A2.copyFrom(A1);
+      testUnaryFunc(A1, A2);
+    }
+  }
+};
+
+void SetTensorValue(Matrix& matrix, real value) {
+  int height = matrix.getHeight();
+  int width = matrix.getWidth();
+  int stride = matrix.getStride();
+  real* data = matrix.getData();
+  for (int i = 0; i < height; i++) {
+    int j = rand() % width;  // NOLINT
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      data[i * stride + j] = value;
+    } else if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
+    } else {
+    }
+  }
+}
+
+template <typename Tensor>
+void testTensorAddScalar(Tensor& A1, Tensor& A2) {
+  real p1 = 2.5;
+  real p2 = 3.0;
+  A1.add(p1);  // a += p
+  A2 += p1;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(p1, p2);  // a = a * p1 + p2
+  A2 = A2 * p1 + p2;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSubScalar(Tensor& A1, Tensor& A2) {
+  real p = 2.5;
+  A1.subScalar(p);  // a -= p
+  A2 -= p;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorMulScalar(Tensor& A1, Tensor& A2) {
+  real p = 2.5;
+  A1.mulScalar(p);  // a *= p
+  A2 *= p;
+  TensorCheckEqual(A1, A2);
+
+  real learningRate = 0.7f;
+  real decayRate = 1.2f;
+  A1.applyL2(learningRate, decayRate);
+  A2 = A2 * (1.0f / (1.0f + learningRate * decayRate));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorDivScalar(Tensor& A1, Tensor& A2) {
+  real p = 2.5;
+  A1.divScalar(p);  // a /= p
+  A2 /= p;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorNeg(Tensor& A1, Tensor& A2) {
+  A1.neg();  // a = -a
+  A2 = -A2;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorAbs(Tensor& A1, Tensor& A2) {
+  A1.abs2();  // a = a > 0 ? a : -a
+  A2 = A2.abs();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSquare(Tensor& A1, Tensor& A2) {
+  A1.square2();  // a = a * a
+  A2 = A2.square();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorReciprocal(Tensor& A1, Tensor& A2) {
+  A1.reciprocal2();  // a = 1.0f / a
+  A2 = A2.reciprocal();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSign(Tensor& A1, Tensor& A2) {
+  A1.sign2();  // a = (a > 0) - (a < 0)
+  A2 = A2.sign();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorAssign(Tensor& A1, Tensor& A2) {
+  A1.assign(1.5);  // a = p
+  A2 = A2.constant(1.5);
+  TensorCheckEqual(A1, A2);
+
+  A1.one();  // a = 1
+  A2 = A2.constant(1.0);
+  TensorCheckEqual(A1, A2);
+
+  A1.zero();  // a = 0
+  A2 = A2.constant(0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
+  testTensorAddScalar(A1, A2);
+  testTensorSubScalar(A1, A2);
+  testTensorMulScalar(A1, A2);
+  testTensorDivScalar(A1, A2);
+  testTensorNeg(A1, A2);
+  testTensorAbs(A1, A2);
+  testTensorSquare(A1, A2);
+  testTensorReciprocal(A1, A2);
+  testTensorSign(A1, A2);
+  testTensorAssign(A1, A2);
+}
+
+template <typename Tensor>
+void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
+  A1.add(2);  // a += p
+  A2 += 2;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(3, 2);  // a = a * p1 + p2
+  A2 = A2 * 3 + 2;
+  TensorCheckEqual(A1, A2);
+
+  testTensorNeg(A1, A2);
+  testTensorAbs(A1, A2);
+}
+
+TEST(Unary, BaseOp) {
+  TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
+  TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
+  TestUnaryVectorT<CpuIVector, int> testCpuIVector(
+      testUnaryBaseOpInt<CpuIVector>);
+
+#ifdef PADDLE_WITH_GPU
+  TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
+  TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
+  TestUnaryVectorT<GpuIVector, int> testGpuIVector(
+      testUnaryBaseOpInt<GpuIVector>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorExp(Tensor& A1, Tensor& A2) {
+  A1.exp2();  // a = exp(a)
+  A2 = A2.exp();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorLog(Tensor& A1, Tensor& A2) {
+  A1.log2();  // a = log(a)
+  A2 = A2.log();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSqrt(Tensor& A1, Tensor& A2) {
+  A1.sqrt2();  // a = sqrt(a)
+  A2 = A2.sqrt();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorPow(Tensor& A1, Tensor& A2) {
+  A1.pow2(3.2);  // a = pow(a, p)
+  A2 = A2.pow(3.2);
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testUnaryMathOp(Tensor& A1, Tensor& A2) {
+  testTensorExp(A1, A2);
+  testTensorLog(A1, A2);
+  testTensorSqrt(A1, A2);
+  testTensorPow(A1, A2);
+}
+
+TEST(Unary, MathOp) {
+  TestUnaryMatrix<CpuMatrix> testCpu(testUnaryMathOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestUnaryMatrix<GpuMatrix> testGpu(testUnaryMathOp<GpuMatrix>);
+#endif
+}
+
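+// Tensor expressions have no ternary operator; (cond).condition(x, y)
+// selects x where cond holds and y elsewhere, so a clamp to [p1, p2] can be
+// written as the nested selection used below.
+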
+template <typename Tensor>
+void testTensorClip(Tensor& A1, Tensor& A2) {
+  real p1 = 0.003f;
+  real p2 = 0.877f;
+  A1.clip(p1, p2);  // a = a < p1 ? p1 : (a > p2 ? p2 : a)
+  // A2 = A2.min(0.877f).max(0.003f);
+  A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
+  real p = 0.5f;
+  A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
+  A2 = (A2 > p).condition((real)1.0, (real)0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorapplyL1(Tensor& A1, Tensor& A2) {
+  /**
+   * T lambda = p;
+   * a = (a > lambda) ? (a - lambda)
+   *                  : (a < -lambda) ? (a + lambda) : 0
+   *
+   * p = learningRate * decayRate;
+   */
+  real learningRate = 0.7f;
+  real decayRate = 0.6f;
+  A1.applyL1(learningRate, decayRate);
+  A2 = (A2 > (learningRate * decayRate))
+           .condition(
+               (A2 - (learningRate * decayRate)),
+               (A2 < -(learningRate * decayRate))
+                   .condition((A2 + (learningRate * decayRate)), (real)0.0));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testUnaryCompareOp(Tensor& A1, Tensor& A2) {
+  testTensorClip(A1, A2);
+  testTensorBiggerThanScalar(A1, A2);
+
+  A1.randomizeUniform();
+  A1.subScalar(0.5f);
+  A2.copyFrom(A1);
+  testTensorapplyL1(A1, A2);
+}
+
+TEST(Unary, CompareOp) {
+  TestUnaryMatrix<CpuMatrix> testCpu(testUnaryCompareOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestUnaryMatrix<GpuMatrix> testGpu(testUnaryCompareOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p1 = 2.5;
+  real p2 = 3.2;
+  A1.add(B);  // a += b
+  A2 += B;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(B, p1);  // a += b * p
+  A2 += B * p1;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(B, p1, p2);  // a = p1 * a + p2 * b
+  A2 = A2 * p1 + B * p2;
+  TensorCheckEqual(A1, A2);
+
+  A1.addScalar(B, p1);  // a = b + p
+  A2 = B + p1;
+  TensorCheckEqual(A1, A2);
+
+  A1.addSquare(B, p1);  // a += p * b * b
+  A2 += B.constant(p1) * B * B;
+  TensorCheckEqual(A1, A2);
+
+  A1.decayAddSquare(B, p1, p2);  // a = p1 * a + p2 * b * b
+  A2 = A2 * p1 + B.constant(p2) * B * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 2.5;
+  A1.sub(B);  // a -= b
+  A2 -= B;
+  TensorCheckEqual(A1, A2);
+
+  A1.sub(B, p);  // a -= b * p
+  A2 -= B * p;
+  TensorCheckEqual(A1, A2);
+
+  A1.subScalar(B, p);  // a = b - p
+  A2 = B - p;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 2.5;
+  A1.mulScalar(B, p);  // a = b * p
+  A2 = B * p;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotMulSquare(B);  // a *= b * b
+  A2 *= B * B;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotSquareMul(B);  // a = a * a * b
+  A2 = A2 * A2 * B;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotMul(B);  // a *= b
+  A2 *= B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 2.5;
+  A1.divScalar(B, p);  // a = b / p
+  A2 = B / p;
+  TensorCheckEqual(A1, A2);
+
+  A1.scalarDiv(B, p);  // a = p / b
+  A2 = B.constant(p) / B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.assign(B);  // a = b
+  A2 = B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.square2(A1);  // b = a * a
+  A2 = B.square();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.squareDerivative(B);  // a *= 2.0 * b
+  A2 = A2 * (real)2.0 * B;
+  TensorCheckEqual(A1, A2);
+}
+
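+// applyL2(B, lr, decay) implements L2 weight decay with a per-element
+// learning rate b: a *= 1 / (1 + lr * decay * b). The expression form in
+// testTensorReciprocal below spells out exactly that factor.
+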
+template <typename Tensor>
+void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.reciprocal2(A1);  // b = 1.0f / a
+  A2 = B.reciprocal();
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 0.58;
+  real p2 = 0.32;
+  A1.reciprocal2(B, p1, p2);  // a = 1 / (p1 * b + p2)
+  A2 = (B * p1 + p2).reciprocal();
+  TensorCheckEqual(A1, A2);
+
+  real learningRate = 0.7f;
+  real decayRate = 1.2f;
+  A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
+  A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B)
+            .reciprocal();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.reciprocalDerivative(B);  // a *= -b * b
+  A2 *= (-B) * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
+  A2 = B.sign();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.abs2(A1);  // b = a > 0.0f ? a : -a
+  A2 = B.abs();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
+  testTensorAdd(A1, A2, B);
+  testTensorSub(A1, A2, B);
+  testTensorMul(A1, A2, B);
+  testTensorDiv(A1, A2, B);
+  testTensorSquare(A1, A2, B);
+  testTensorSquareDerivative(A1, A2, B);
+  testTensorReciprocal(A1, A2, B);
+  testTensorReciprocalDerivative(A1, A2, B);
+  testTensorAbs(A1, A2, B);
+  testTensorSign(A1, A2, B);
+  testTensorAssign(A1, A2, B);
+}
+
+TEST(Binary, BaseOp) {
+  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = exp(b)
+  A1.exp2(B);
+  A2 = B.exp();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.expDerivative(B);  // a *= b
+  A2 *= B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = log(b)
+  A1.log2(B);
+  A2 = B.log();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = sqrt(b)
+  A1.sqrt2(B);
+  A2 = B.sqrt();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = 1.0f / sqrt(b)
+  A1.invSqrt(B);
+  A2 = B.sqrt().reciprocal();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.pow2(B, 2.5f);  // a = pow(b, p)
+  A2 = B.pow(2.5f);
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+   * const T THRESHOLD = 40.0;
+   * b = log(1.0 +
+   *         exp((a > THRESHOLD) ? THRESHOLD
+   *                             : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))
+   */
+  B.softrelu(A1);
+
+  real THRESHOLD = 40.0;
+  A2 = (B.constant(1.0f) +
+        (B > THRESHOLD)
+            .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))
+            .exp())
+           .log();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+   * const T THRESHOLD = 40.0;
+   * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
+   *                             ? THRESHOLD
+   *                             : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))
+   */
+  A1.softreluDerivative(B);
+  real THRESHOLD = 40.0;
+  A2 = A2 *
+       (B.constant(1.0f) -
+        (B.constant(-1.0f) *
+         (B > THRESHOLD)
+             .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)))
+            .exp());
+  TensorCheckErr(A1, A2);
+}
+
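+// The exp-based activations clamp their argument before calling exp():
+// expf overflows single precision near 88, so clipping to +/-40 keeps the
+// computation finite while sigmoid/softrelu are already saturated there.
+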
+template <typename Tensor>
+void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+   * const T THRESHOLD_MIN = -40.0;
+   * const T THRESHOLD_MAX = 13.0;
+   * T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN
+   *                             : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
+   * b = 1.0f / (1.0f + exp(-tmp))
+   */
+  B.sigmoid(A1);
+
+  const real THRESHOLD_MIN = -40.0;
+  const real THRESHOLD_MAX = 13.0;
+  auto tmp = (B < THRESHOLD_MIN)
+                 .condition(THRESHOLD_MIN,
+                            (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
+  A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.sigmoidDerivative(B);  // a *= b * (1 - b)
+  A2 *= B * (B.constant(1.0f) - B);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+  A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.tanhDerivative(B);  // a *= 1 - b * b
+  A2 *= B.constant(1.0f) - B * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p1 = 2.5;
+  real p2 = 3.1;
+  // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
+  B.scaledTanh(A1, p1, p2);
+  A2 = B.constant(p1) *
+       (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) -
+        (real)1.0);
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p1 = 2.5;
+  real p2 = 3.1;
+  // a *= (p2 / p1) * (p1 * p1 - b * b)
+  A1.scaledTanhDerivative(B, p1, p2);
+  A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
+  testTensorTanhDerivative(A1, A2, B);
+  testTensorScaledTanhDerivative(A1, A2, B);
+  testTensorSigmoidDerivative(A1, A2, B);
+  testTensorExpDerivative(A1, A2, B);
+  testTensorScaledTanh(A1, A2, B);
+  testTensorTanh(A1, A2, B);
+  testTensorExp(A1, A2, B);
+  testTensorLog(A1, A2, B);
+  testTensorSqrt(A1, A2, B);
+  testTensorInvSqrt(A1, A2, B);
+  testTensorPow(A1, A2, B);
+
+  testTensorSoftrelu(A1, A2, B);
+  testTensorSoftreluDerivative(A1, A2, B);
+  testTensorSigmoid(A1, A2, B);
+}
+
+TEST(Binary, MathOp) {
+  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.relu(A1);  // b = a > 0.0f ? a : 0.0f
+  A2 = (B > (real)0.0f).condition(B, (real)0.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
+  A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+   * b = a > p1 ? a : p1
+   * b = b < p2 ? b : p2
+   * int p1 = 0, p2 = 24;
+   */
+  SetTensorValue(B, 32.0f);
+  B.brelu(A1);
+  auto tmp = (B > (real)0.0f).condition(B, (real)0.0f);
+  A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  SetTensorValue(B, 32.0f);
+  /*
+   * a *= (b > p1 && b < p2) ? 1.0 : 0.0
+   * int p1 = 0, p2 = 24;
+   */
+  A1.breluDerivative(B);
+  A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
+  A2 = (B > (real)0.0f)
+           .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 0.613;
+  SetTensorValue(B, p);
+  A1.isEqualTo(B, p);  // a = (b == p)
+  A2 = (B == p);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
+  /**
+   * T lambda = p * b;
+   * a = (a > lambda) ? (a - lambda)
+   *                  : (a < -lambda) ? (a + lambda) : 0
+   *
+   * p = learningRate * decayRate;
+   */
+  real learningRate = 0.7f;
+  real decayRate = 0.6f;
+  A1.applyL1(B, learningRate, decayRate);
+  auto lambda = B.constant(learningRate * decayRate) * B;
+  A2 = (A2 > lambda)
+           .condition((A2 - lambda),
+                      (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.subScalar(0.5f);
+  SetTensorValue(B, 0.0f);
+  testTensorReluDerivative(A1, A2, B);
+
+  A1.randomizeUniform();
+  A2.copyFrom(A1);
+  testTensorBreluDerivative(A1, A2, B);
+
+  testTensorAbsDerivative(A1, A2, B);
+  testTensorRelu(A1, A2, B);
+  testTensorBrelu(A1, A2, B);
+  testTensorIsEqualTo(A1, A2, B);
+}
+
+TEST(Binary, CompareOp) {
+  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.add(B, C);  // a = b + c
+  A2 = B + C;
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+  real p3 = 3.8;
+  A1.add(B, p1, C, p2);  // a = p1 * b + p2 * c
+  A2 = B * p1 + C * p2;
+  TensorCheckEqual(A1, A2);
+
+  A1.add2(B, C);  // a = a + b + c
+  A2 = A2 + B + C;
+  TensorCheckEqual(A1, A2);
+
+  A1.add2(B, C, p1, p2, p3);  // a = p1 * a + p2 * b + p3 * c
+  A2 = A2 * p1 + B * p2 + C * p3;
+  TensorCheckEqual(A1, A2);
+
+  A1.decayAddSquareMul(B, C, p1, p2);  // a = p1 * a + p2 * b * b * c * c
+  A2 = A2 * p1 + B.constant(p2) * B * B * C * C;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.sub(B, C);  // a = b - c
+  A2 = B - C;
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+  A1.sub(B, p1, C, p2);  // a = p1 * b - p2 * c
+  A2 = B * p1 - C * p2;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.dotMul(B, C);  // a = b * c
+  A2 = B * C;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotMulSquare(B, C);  // a = b * c * c
+  A2 = B * C * C;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotSquareSquare(B, C);  // a = b * b * c * c
+  A2 = B * B * C * C;
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+
+  /*
+   * T tmp = p1 * b + p2 * c;
+   * a *= tmp * tmp
+   */
+  A1.dotMulSquareSum(B, C, p1, p2);
+  auto tmp = B * p1 + C * p2;
+  A2 *= tmp * tmp;
+  TensorCheckEqual(A1, A2);
+
+  /*
+   * T tmp = p1 * b + p2 * c;
+   * a = tmp * tmp
+   */
+  A1.dotSquareSum(B, C, p1, p2);
+  auto tmp2 = B * p1 + C * p2;
+  A2 = tmp2 * tmp2;
+  TensorCheckEqual(A1, A2);
+
+  // a *= p1 * b + p2 * c
+  A1.dotMulSum(B, C, p1, p2);
+  A2 *= B * p1 + C * p2;
+  TensorCheckEqual(A1, A2);
+
+  // a = p1 * a + p2 * b * c
+  A1.addDotMul(B, C, p1, p2);
+  A2 = A2 * p1 + B.constant(p2) * B * C;
+  TensorCheckEqual(A1, A2);
+}
+
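+// dotDiv treats elements with a zero numerator as exactly zero instead of
+// evaluating b / c, which avoids producing NaN for 0 / 0.
+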
+template <typename Tensor>
+void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
+  A2 = (B == (real)0.0).condition((real)0.0, B / C);
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+  A1.dotDiv(B, C, p1, p2);  // a = (b + p1) / (c + p2)
+  A2 = (B + p1) / (C + p2);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  real p1 = 1.5;
+  real p2 = 2.5;
+  real p3 = 3.5;
+  A1.reciprocalSum(B, C, p1, p2, p3);  // a = 1 / (p1 * b + p2 * c + p3)
+  A2 = (B * p1 + C * p2 + p3).reciprocal();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
+  A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSoftCrossEntropyBp(Tensor& A1,
+                                  Tensor& A2,
+                                  Tensor& B,
+                                  Tensor& C) {
+  A1.softCrossEntropyBp(B, C);  // a += (b - c) / (b * (1 - b))
+  A2 += (B - C) / (B * (B.constant(1.0f) - B));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  testTensorAdd(A1, A2, B, C);
+  testTensorSub(A1, A2, B, C);
+  testTensorMul(A1, A2, B, C);
+  testTensorDiv(A1, A2, B, C);
+  testTensorReciprocal(A1, A2, B, C);
+  testTensorSoftCrossEntropyBp(A1, A2, B, C);
+
+  testTensorSoftCrossEntropy(A1, A2, B, C);
+}
+
+TEST(Ternary, BaseOp) {
+  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorBinaryLabelCrossEntropy(Tensor& A1,
+                                       Tensor& A2,
+                                       Tensor& B,
+                                       Tensor& C) {
+  A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
+  A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log()));
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
+                                         Tensor& A2,
+                                         Tensor& B,
+                                         Tensor& C) {
+  // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
+  A1.binaryLabelCrossEntropyBp(B, C);
+  A2 += (C > (real)0.5)
+            .condition((B.constant(-1.0f) / B),
+                       (B.constant(1.0f) - B).reciprocal());
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorLogisticRegressionLoss(Tensor& A1,
+                                      Tensor& A2,
+                                      Tensor& B,
+                                      Tensor& C) {
+  SetTensorValue(B, 50.0f);
+  SetTensorValue(B, -50.0f);
+  /**
+   * const T THRESHOLD = 40.0;
+   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
+   *                                         ? -THRESHOLD
+   *                                         : b;
+   * a = log(1 + exp(x)) - c * x
+   */
+  A1.logisticRegressionLoss(B, C);
+  real THRESHOLD = 40.0;
+  auto tmp =
+      (B > THRESHOLD)
+          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
+  A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorLogisticRegressionLossBp(Tensor& A1,
+                                        Tensor& A2,
+                                        Tensor& B,
+                                        Tensor& C) {
+  SetTensorValue(B, 50.0f);
+  SetTensorValue(B, -50.0f);
+  /**
+   * const T THRESHOLD = 40.0;
+   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
+   *                                         ? -THRESHOLD
+   *                                         : b;
+   * x = exp(x); a = x / (1 + x) - c
+   */
+  A1.logisticRegressionLossBp(B, C);
+  real THRESHOLD = 40.0;
+  auto tmp =
+      (B > THRESHOLD)
+          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
+  auto tmp2 = tmp.exp();
+  A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
+  A2 = (B > C).condition((real)1.0f, (real)0.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.max2(B, C);  // a = (b > c) ? b : c
+  A2 = (B > C).condition(B, C);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
+  testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
+  testTensorBiggerThan(A1, A2, B, C);
+  testTensorMax(A1, A2, B, C);
+
+  testTensorLogisticRegressionLoss(A1, A2, B, C);
+  testTensorLogisticRegressionLossBp(A1, A2, B, C);
+}
+
+TEST(Ternary, CompareOp) {
+  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testQuaternaryAdd(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
+  // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
+  // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
+  // TensorCheckEqual(A1, A2);
+
+  /*
+   * T tmp = p1 * b + p2 * c + p3 * d;
+   * a += tmp * tmp
+   */
+  real p1 = 1.5f;
+  real p2 = 2.5f;
+  real p3 = 3.5f;
+  A1.addSquareSum(B, C, D, p1, p2, p3);
+  auto tmp = B * p1 + C * p2 + D * p3;
+  A2 += tmp * tmp;
+  TensorCheckEqual(A1, A2);
+}
+
+TEST(Quaternary, BaseOp) {
+  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorBiggerThan(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
+  // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f
+  A1.biggerThan(B, C, D);
+  A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5))
+           .condition((real)1.0, (real)0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorRankLoss(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
+  /**
+   * const T THRESHOLD = 40.0; a = b - c;
+   * a = (a > THRESHOLD)
+   *         ? THRESHOLD
+   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
+   * a = log(1 + exp(a)) - a * d
+   */
+  A1.rankLoss(B, C, D);
+
+  real THRESHOLD = 40.0;
+  auto tmp = B - C;
+  auto tmp2 =
+      (tmp > THRESHOLD)
+          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
+  A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;
+
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorRankLossBp(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
+  /**
+   * const T THRESHOLD = 40.0; a = b - c;
+   * a = (a > THRESHOLD)
+   *         ? THRESHOLD
+   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
+   * a = exp(a); a = (a / (1 + a) - d)
+   */
+  A1.rankLossBp(B, C, D);
+  real THRESHOLD = 40.0;
+  auto tmp = B - C;
+  auto tmp2 =
+      (tmp > THRESHOLD)
+          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
+  auto tmp3 = tmp2.exp();
+  A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
+
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testQuaternaryCompareOp(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
+  testTensorBiggerThan(A1, A2, B, C, D);
+  testTensorRankLoss(A1, A2, B, C, D);
+  testTensorRankLossBp(A1, A2, B, C, D);
+}
+
+TEST(Quaternary, CompareOp) {
+  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
+#endif
+}
diff --git a/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp b/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3ae9cf111ad4259462be34795df4dbab685302b8
--- /dev/null
+++ b/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp
@@ -0,0 +1,461 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "OriginalOptimizerApi.h"
+#include "PerfUtils.h"
+#include "TensorCheck.h"
+#include "paddle/legacy/math/TrainingAlgorithmOp.h"
+#include "paddle/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+
+#ifndef PADDLE_TYPE_DOUBLE
+DEFINE_double(max_diff, 1e-5, "max diff allowed");
+#else
+DEFINE_double(max_diff, 1e-13, "max diff allowed");
+#endif
+
+class SetMaxDiff {
+ public:
+  explicit SetMaxDiff(double max_diff) {
+    max_diff_ = FLAGS_max_diff;
+    FLAGS_max_diff = max_diff;
+  }
+  ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }
+
+ private:
+  double max_diff_;
+};
+
+#define COPY_VECTOR_TO_CPU(cpuVec, vector)             \
+  do {                                                 \
+    if (vector->useGpu()) {                            \
+      cpuVec = Vector::create(vector->getSize(), false); \
+      cpuVec->copyFrom(*vector);                       \
+    } else {                                           \
+      cpuVec = vector;                                 \
+    }                                                  \
+  } while (0)
+
+int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
+  CHECK(vector1.getSize() == vector2.getSize());
+
+  const real* data1 = vector1.getData();
+  const real* data2 = vector2.getData();
+  size_t size = vector1.getSize();
+  int count = 0;
+  for (size_t i = 0; i < size; i++) {
+    real a = data1[i];
+    real b = data2[i];
+    if (fabs(a - b) > FLAGS_max_diff) {
+      if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) {
+        count++;
+      }
+    }
+  }
+
+  return count;
+}
+
+int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
+  VectorPtr tmp1;
+  VectorPtr tmp2;
+  COPY_VECTOR_TO_CPU(tmp1, vector1);
+  COPY_VECTOR_TO_CPU(tmp2, vector2);
+  return VectorCheckErr(*tmp1, *tmp2);
+}
+
+#ifdef PADDLE_DISABLE_TIMER
+
+#define CHECK_VECTORPTR(vector1, vector2) \
+  EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
+
+#else
+
+#define CHECK_VECTORPTR(vector1, vector2)
+
+#endif
+
+typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
+
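+// EXPRESSION_PERFORMANCE times the wrapped statement. When the timer is
+// compiled in (PADDLE_DISABLE_TIMER not defined), CHECK_VECTORPTR expands to
+// nothing so the runs are benchmark-only; otherwise it verifies that both
+// optimizer implementations produced the same vectors.
+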
+void testCase(testMatrixFunc matrixFunc) {
+#ifdef PADDLE_WITH_CUDA
+  for (auto useGpu : {false, true}) {
+#else
+  for (auto useGpu : {false}) {
+#endif
+    for (auto size : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072,
+                      262144, 524288, 1048576, 2097152}) {
+      LOG(INFO) << " size=" << size << " useGpu=" << useGpu;
+      matrixFunc(size, useGpu);
+    }
+  }
+}
+
+#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \
+  vec1[type] = Vector::create(size, useGpu);        \
+  vec2[type] = Vector::create(size, useGpu);        \
+  vec1[type]->rand();                               \
+  vec2[type]->copyFrom(*vec1[type]);
+
+void testAdagrad(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+
+  EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(
+      bufs1, epsilon, learningRate, momentum, decayRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(adagradApply(value, grad, mom, accum_buffer, accum,
+                                      lr, epsilon, learningRate, momentum,
+                                      decayRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, Adagrad) { testCase(testAdagrad); }
+
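+// AdaDelta keeps running averages of squared gradients (accum) and squared
+// updates (accum_update); the step is roughly
+//   update = -sqrt((accum_update + eps) / (accum + eps)) * grad
+// so the test compares the vector-level optimizer against adadeltaApply for
+// every parameter buffer involved.
+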
+void testAdaDelta(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+
+  EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(
+      bufs1, rou, epsilon, learningRate, momentum, decayRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(adadeltaApply(value, grad, mom, accum, accum_update,
+                                       lr, rou, epsilon, learningRate,
+                                       momentum, decayRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, AdaDelta) { testCase(testAdaDelta); }
+
+template <bool isFirstTime>
+void testRMSProp(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  /* make sure 'g - f.square()' greater than 0 */
+  bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0);
+  bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom(
+      *bufs1[PARAMETER_GRADIENT_SQURESUM]);
+
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+  real accumulatedRou = rou;
+
+  EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1, accumulatedRou, rou,
+                                                   epsilon, learningRate,
+                                                   momentum, decayRate,
+                                                   isFirstTime));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(rmspropApply(value, grad, mom, sum, sum1, lr,
+                                      accumulatedRou, rou, epsilon,
+                                      learningRate, momentum, decayRate,
+                                      isFirstTime));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, RMSProp) {
+  testCase(testRMSProp<true>);
+  testCase(testRMSProp<false>);
+}
+
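+// On the first step the squared-gradient accumulator has no history yet;
+// zeroing it in both buffer sets keeps the two implementations starting from
+// identical state for the isFirstTime == true instantiation.
+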
+template <bool isFirstTime>
+void testDecayedAdagrad(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+  real accumulatedRou = rou;
+
+  if (isFirstTime) {
+    bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
+    bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
+  }
+
+  EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(
+      bufs1, accumulatedRou, rou, epsilon, learningRate, momentum, decayRate,
+      isFirstTime));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(decayedAdagradApply(value, grad, mom, sum, lr,
+                                             accumulatedRou, rou, epsilon,
+                                             learningRate, momentum, decayRate,
+                                             isFirstTime));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, DecayedAdagrad) {
+  testCase(testDecayedAdagrad<true>);
+  testCase(testDecayedAdagrad<false>);
+}
+
+void testAdam(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu);
+
+  real beta1 = (real)rand() / (real)RAND_MAX;        // NOLINT
+  real beta2 = (real)rand() / (real)RAND_MAX;        // NOLINT
+  real beta1_power = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real beta2_power = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT
+
+  EXPRESSION_PERFORMANCE(AdamParameterOptimizer(
+      bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM];
+
+  EXPRESSION_PERFORMANCE(adamApply(value, grad, mom, v, beta1, beta2,
+                                   beta1_power, beta2_power, epsilon,
+                                   learningRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM],
+                  bufs2[PARAMETER_SECOND_MOMENTUM]);
+}
+
+TEST(Training, Adam) { testCase(testAdam); }
+
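+// Adamax replaces Adam's second moment by an exponentially weighted infinity
+// norm, roughly u = max(beta2 * u, |grad|), which is why the
+// PARAMETER_WEIGHTED_INFINITY_NORM buffer is checked below.
+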
+void testAdamax(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu);
+
+  real beta1 = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real beta2 = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real alpha = (real)rand() / (real)RAND_MAX;  // NOLINT
+  int64_t step = 2;
+
+  EXPRESSION_PERFORMANCE(
+      AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM];
+
+  EXPRESSION_PERFORMANCE(
+      adamaxApply(value, grad, mom, u, beta1, beta2, step, alpha));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM],
+                  bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]);
+}
+
+TEST(Training, Adamax) {
+#ifndef PADDLE_TYPE_DOUBLE
+  SetMaxDiff diff(1e-4);
+#endif
+  testCase(testAdamax);
+}
+
+void testSparseMomentum(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu);
+
+  real alpha = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real beta = (real)rand() / (real)RAND_MAX;          // NOLINT
+  real gamma = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real tau = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+
+  EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(
+      bufs1, alpha, beta, gamma, tau, learningRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT];
+  BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT];
+
+  EXPRESSION_PERFORMANCE(sparseMomentumApply(
+      value, grad, momU, momV, alpha, beta, gamma, tau, learningRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]);
+}
+
+TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/legacy/math/tests/test_batchTranspose.cpp
similarity index 100%
rename from paddle/math/tests/test_batchTranspose.cpp
rename to paddle/legacy/math/tests/test_batchTranspose.cpp
diff --git a/paddle/legacy/math/tests/test_lazyAssign.cu b/paddle/legacy/math/tests/test_lazyAssign.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cf8c3d77199571dff314446a1e1b14e9b746e947
--- /dev/null
+++ b/paddle/legacy/math/tests/test_lazyAssign.cu
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "PerfUtils.h"
+#include "TensorCheck.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/TensorAssign.h"
+
+using paddle::BaseMatrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using autotest::TensorCheckEqual;
+using autotest::TensorCheckErr;
+
+typedef std::function<void(int height, int width)> testMatrixFunc;
+void testMatrixCase(testMatrixFunc matrixFunc) {
+  for (auto height : {1}) {
+    for (auto width : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072,
+                       262144, 524288, 1048576, 2097152, 4194304, 8388608}) {
+      matrixFunc(height, width);
+    }
+  }
+}
+
+template <typename Tensor>
+void testLazyAssign(int height, int width) {
+  Tensor A1(height, width);
+  Tensor A2(height, width);
+  Tensor B(height, width);
+  Tensor C(height, width);
+  Tensor D(height, width);
+  A1.randomizeUniform();
+  B.randomizeUniform();
+  C.randomizeUniform();
+  D.randomizeUniform();
+  A2.copyFrom(A1);
+
+  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
+
+  EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C);
+                         auto expr2 = A2.lazyAssign(A2 * D);
+                         AssignEvaluate(expr1, expr2););
+
+  TensorCheckErr(A1, A2);
+}
+
+TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
+
+#ifdef PADDLE_WITH_GPU
+TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
+#endif
+
+template <typename Tensor>
+void sgdUpdateTensor(
+    Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) {
+  C = C * p2 - D * (B + A * p3) * p1;
+  A += C;
+}
+
+void sgdUpdateLazyAssign(BaseMatrix& A,
+                         BaseMatrix& B,
+                         BaseMatrix& C,
+                         BaseMatrix& D,
+                         real p1,
+                         real p2,
+                         real p3) {
+  auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
+  auto expr2 = A.lazyAssign(A + C);
+  AssignEvaluate(expr1, expr2);
+}
+
+template <typename Tensor>
+void testSgdUpdate(int height, int width) {
+  Tensor A1(height, width);
+  Tensor A2(height, width);
+  Tensor A3(height, width);
+  A1.randomizeUniform();
+  A2.copyFrom(A1);
+  A3.copyFrom(A1);
+
+  Tensor B(height, width);
+  B.randomizeUniform();
+
+  Tensor C1(height, width);
+  Tensor C2(height, width);
+  Tensor C3(height, width);
+  C1.randomizeUniform();
+  C2.copyFrom(C1);
+  C3.copyFrom(C1);
+
+  Tensor D(height, width);
+  D.randomizeUniform();
+
+  real p1 = 0.2;
+  real p2 = 0.3;
+  real p3 = 0.5;
+
+  /**
+   * c = p2 * c - p1 * (b + p3 * a);
+   * a = a + c;
+   */
+  // BaseMatrix API
+  EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3););
+
+  // Tensor expression
+  EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
+
+  // lazyAssign
+  EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
+
+  TensorCheckErr(A1, A2);
+  TensorCheckErr(A1, A3);
+  TensorCheckErr(C1, C2);
+  TensorCheckErr(C1, C3);
+}
+
+TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
+
+#ifdef PADDLE_WITH_GPU
+TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
+#endif
diff --git a/paddle/legacy/math/tests/test_matrixCompare.cpp b/paddle/legacy/math/tests/test_matrixCompare.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..98521aeb04bf46bf0061f3fa27455a2089d0c8b1
--- /dev/null
+++ b/paddle/legacy/math/tests/test_matrixCompare.cpp
@@ -0,0 +1,1698 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+/// This unit test checks that GpuMatrix and CpuMatrix produce the same
+/// results, so it is disabled in the CPU-only build.
+
+#include <gtest/gtest.h>
+#include "TensorCheck.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/testing/TestUtil.h"
+#include "paddle/utils/DynamicLoader.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+using autotest::TensorCheckEqual;
+using autotest::TensorCheckErr;
+
+void testMatrixMaxSequence(int batchSize, int inputDim) {
+  // forward
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  int newBatchSize = cpuSequence->getSize() - 1;
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
+  cpuOutput->zero();
+  gpuOutput->zero();
+
+  IVectorPtr cpuIndex = nullptr;
+  IVectorPtr gpuIndex = nullptr;
+  IVector::resizeOrCreate(cpuIndex, newBatchSize * inputDim, false);
+  IVector::resizeOrCreate(gpuIndex, newBatchSize * inputDim, true);
+  cpuIndex->zeroMem();
+  gpuIndex->zeroMem();
+
+  cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex);
+  gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex);
+
+  TensorCheckEqual(*cpuOutput, *gpuOutput);
+  TensorCheckEqual(*cpuIndex, *gpuIndex);
+
+  // backward
+  MatrixPtr cpuOutputGrad = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
+  MatrixPtr gpuOutputGrad = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
+  cpuOutputGrad->randomizeUniform();
+  gpuOutputGrad->copyFrom(*cpuOutputGrad);
+
+  MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInputGrad->randomizeUniform();
+  gpuInputGrad->copyFrom(*cpuInputGrad);
+
+  cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex);
+  gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex);
+
+  TensorCheckEqual(*cpuInputGrad, *gpuInputGrad);
+}
+
+TEST(Matrix, maxSequence) {
+  for (auto batchSize : {1, 3, 997}) {   // prime numbers close to 1, 4, 1024
+    for (auto inputDim : {1, 7, 131}) {  // prime numbers close to 1, 8, 128
+      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
+      testMatrixMaxSequence(batchSize, inputDim);
+    }
+  }
+}
+
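+// getSum() accumulates height * width elements, so the allowed absolute
+// error below scales with the element count: 1e-6 * 10^log10(h * w) is
+// roughly 1e-6 * h * w, i.e. a constant per-element rounding budget.
+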
+void testMatrixGetSum(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+#ifndef PADDLE_TYPE_DOUBLE
+  int x = log10(height * width);
+  real err = 1e-6 * pow(10, x);
+#else
+  real err = 1e-8;
+#endif
+
+  real cpuSum = cpuInput->getSum();
+  real gpuSum = gpuInput->getSum();
+
+  EXPECT_LE(fabs(cpuSum - gpuSum), err);
+}
+
+void testMatrixGetMinMax(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  real cpuMin = cpuInput->getMin();
+  real gpuMin = gpuInput->getMin();
+  real cpuMax = cpuInput->getMax();
+  real gpuMax = gpuInput->getMax();
+
+  EXPECT_EQ(cpuMin, gpuMin);
+  EXPECT_EQ(cpuMax, gpuMax);
+}
+
+void testMatrixZeroAtOffset(int height, int width) {
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr cpuTest = std::make_shared<CpuMatrix>(height, width);
+
+  cpuA->randomizeUniform();
+  gpuA->copyFrom(*cpuA);
+  cpuTest->copyFrom(*cpuA);
+
+  int columnOffset = rand() % width;                  // NOLINT
+  int numColumns = rand() % (width - columnOffset);   // NOLINT
+
+  if (numColumns == 0) return;
+
+  cpuA->zeroAtOffset(columnOffset, numColumns);
+  gpuA->zeroAtOffset(columnOffset, numColumns);
+
+  /* cpuTest */
+  real* a = cpuTest->getData() + columnOffset;
+  for (int64_t i = 0; i < height; ++i) {
+    for (int64_t j = 0; j < numColumns; ++j) {
+      a[i * width + j] = 0;
+    }
+  }
+
+  TensorCheckEqual(*cpuA, *gpuA);
+  TensorCheckEqual(*cpuA, *cpuTest);
+}
+
+void testMatrixDeepSwap(int height, int width) {
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuCopyA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuCopyB = std::make_shared<CpuMatrix>(height, width);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuCopyA->copyFrom(*cpuA);
+  cpuCopyB->copyFrom(*cpuB);
+
+  // swap matrix cpuA and cpuB
+  cpuA->deepSwap(*cpuB);
+
+  TensorCheckEqual(*cpuA, *cpuCopyB);
+  TensorCheckEqual(*cpuB, *cpuCopyA);
+}
+
+void testMatrixTranspose(int height, int width) {
+  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr cpuT = std::make_shared<CpuMatrix>(width, height);
+  MatrixPtr gpuT = std::make_shared<GpuMatrix>(width, height);
+
+  cpu->randomizeUniform();
+  gpu->copyFrom(*cpu);
+  cpu->transpose(cpuT, false);
+  gpu->transpose(gpuT, true);
+
+  TensorCheckEqual(*cpuT, *gpuT);
+}
+
+void testMatrixRotate(int height, int width) {
+  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr cpuR = std::make_shared<CpuMatrix>(width, height);
+  MatrixPtr gpuR = std::make_shared<GpuMatrix>(width, height);
+
+  cpu->randomizeUniform();
+  gpu->copyFrom(*cpu);
+
+  cpu->rotate(cpuR, false, true);
+  gpu->rotate(gpuR, true, true);
+  TensorCheckEqual(*cpuR, *gpuR);
+
+  cpu->rotate(cpuR, true, false);
+  gpu->rotate(gpuR, false, false);
+  TensorCheckEqual(*cpuR, *gpuR);
+}
+
+void testMatrixInverse(int height) {
+  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, height);
+  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, height);
+  MatrixPtr cpuI = std::make_shared<CpuMatrix>(height, height);
+  MatrixPtr gpuI = std::make_shared<GpuMatrix>(height, height);
+
+  /* Make matrix well conditioned: cpu * cpuT + Identity */
+  cpu->randomizeUniform();
+  MatrixPtr cpuT = cpu->getTranspose();
+  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, height);
+  outputCheck->mul(*cpu, *cpuT);
+  cpu->setDiag(1.0);
+  cpu->add(*outputCheck);
+
+  gpu->copyFrom(*cpu);
+  cpu->inverse(cpuI, true);
+  gpu->inverse(gpuI, false);
+
+  TensorCheckErr(*cpuI, *gpuI);
+
+  outputCheck->mul(*cpu, *cpuI);
+  cpu->setDiag(1.0);
+  TensorCheckErr(*cpu, *outputCheck);
+}
+
+TEST(Matrix, unary) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      VLOG(3) << " height=" << height << " width=" << width;
+
+      testMatrixDeepSwap(height, width);
+      testMatrixZeroAtOffset(height, width);
+      testMatrixGetSum(height, width);
+      testMatrixTranspose(height, width);
+      testMatrixRotate(height, width);
+    }
+#ifdef LAPACK_FOUND
+    // inverse matrix
+    testMatrixInverse(height);
+#else
+    LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK "
+                 << "support, so we cannot test matrix inverse. To test "
+                 << "matrix inverse, please install LAPACKE "
+                 << "and MKL/OpenBLAS, and rebuild PaddlePaddle.";
+#endif
+  }
+}
+
+void testMatrixSoftmax(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
+
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+  cpuOutput->zero();
+  gpuOutput->zero();
+  cpuInput->softmax(*cpuOutput);
+  gpuInput->softmax(*gpuOutput);
+
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+}
+
+void testSequenceSoftmax(int batchSize) {
+  // forward
+  int inputDim = 1;
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence);
+  gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence);
+
+  TensorCheckErr(*cpuInput, *gpuInput);
+}
+
+void testMatrixSoftmaxThreshold(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
+
+  cpuInput->randomizeUniform();
+  cpuInput->getData()[0] = 100.0;
+  gpuInput->copyFrom(*cpuInput);
+  cpuOutput->zero();
+  gpuOutput->zero();
+  cpuInput->softmax(*cpuOutput);
+  gpuInput->softmax(*gpuOutput);
+
+  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
+  outputCheck->copyFrom(*gpuOutput);
+  // count zeros in the outputs; softmax should never underflow to exactly 0
+  int cpuCount = 0;
+  int gpuCount = 0;
+  auto zeroNum = [](MatrixPtr out, int& count) {
+    for (size_t i = 0; i < out->getHeight(); i++) {
+      for (size_t j = 0; j < out->getWidth(); j++) {
+        if (out->getElement(i, j) == 0) count++;
+      }
+    }
+  };
+  zeroNum(cpuOutput, cpuCount);
+  zeroNum(outputCheck, gpuCount);
+  EXPECT_EQ(cpuCount, 0) << "CPU softmax output contains zero values";
+  EXPECT_EQ(gpuCount, 0) << "GPU softmax output contains zero values";
+}
+
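+// softmax backward: with y = softmax(x) the input gradient is
+//   dx_i = y_i * (dy_i - sum_j(dy_j * y_j)),
+// which the CPU reference below assembles from dotMul (elementwise product),
+// colMerge (row-wise sum) and softmaxDerivative.
+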
+
+TEST(Matrix, softmax) {
+  for (auto height : {1, 3, 131}) {    // prime numbers close to 1, 4, 127
+    for (auto width : {1, 17, 251}) {  // prime numbers close to 1, 16, 256
+      VLOG(3) << " height=" << height << " width=" << width;
+
+      testMatrixSoftmax(height, width);
+      testMatrixSoftmaxBp(height, width);
+      testMatrixSoftmaxThreshold(height, width);
+    }
+    testSequenceSoftmax(height);
+  }
+}
+
+void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
+  MatrixPtr cpuTable = std::make_shared<CpuMatrix>(tableSize, inputDim);
+  MatrixPtr gpuTable = std::make_shared<GpuMatrix>(tableSize, inputDim);
+  cpuTable->randomizeUniform();
+  gpuTable->copyFrom(*cpuTable);
+
+  IVectorPtr cpuIds;
+  IVectorPtr gpuIds;
+  cpuIds = VectorT<int>::create(numSamples, false);
+  gpuIds = VectorT<int>::create(numSamples, true);
+  cpuIds->rand(tableSize);
+  gpuIds->copyFrom(*cpuIds);
+
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, inputDim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, inputDim);
+  cpuOutput->randomizeUniform();
+  gpuOutput->copyFrom(*cpuOutput);
+
+  cpuOutput->addToRows(*cpuTable, *cpuIds);
+  gpuOutput->addToRows(*gpuTable, *gpuIds);
+
+  TensorCheckErr(*cpuTable, *gpuTable);
+}
+
+TEST(Matrix, tableProjection) {
+  for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
+    for (auto tableSize : {10, 100}) {
+      for (auto inputDim : {20, 50}) {
+        VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
+                << " inputDim=" << inputDim;
+        testMatrixAddToRows(numSamples, tableSize, inputDim);
+      }
+    }
+  }
+}
+
+void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
+  int heightA = transa == false ? dimM : dimK;
+  int widthA = transa == false ? dimK : dimM;
+  int heightB = transb == false ? dimK : dimN;
+  int widthB = transb == false ? dimN : dimK;
+  int heightC = dimM;
+  int widthC = dimN;
+
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
+  MatrixPtr cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
+  MatrixPtr gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
+
+  real alpha = 1.5;
+  real beta = 2.0;
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuC->randomizeUniform();
+  gpuA->copyFrom(*cpuA);
+  gpuB->copyFrom(*cpuB);
+  gpuC->copyFrom(*cpuC);
+
+  cpuC->mul(*cpuA, *cpuB, alpha, beta);
+  gpuC->mul(*gpuA, *gpuB, alpha, beta);
+
+  TensorCheckErr(*cpuC, *gpuC);
+}
+
+void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
+  int heightA = transa == false ? dimM : dimK;
+  int widthA = transa == false ? dimK : dimM;
+  int heightB = transb == false ? dimK : dimN;
+  int widthB = transb == false ? dimN : dimK;
+  int heightC = dimM;
+  int widthC = dimN;
+
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
+  MatrixPtr cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
+  MatrixPtr gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
+
+  real alpha = 1.5;
+  real beta = 2.0;
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuC->randomizeUniform();
+  gpuA->copyFrom(*cpuA);
+  gpuB->copyFrom(*cpuB);
+  gpuC->copyFrom(*cpuC);
+
+  auto subSize = [](int& start, int& end, int dim) {
+    if (dim == 1) {
+      start = 0;
+      end = dim;
+    } else {
+      int subDim = rand() % (dim - 1) + 1;  // NOLINT
+      start = rand() % (dim - subDim);      // NOLINT
+      end = start + subDim;
+    }
+  };
+
+  auto subMatrix = [](MatrixPtr& sub,
+                      MatrixPtr matrix,
+                      size_t startRow,
+                      size_t endRow,
+                      size_t startCol,
+                      size_t endCol) {
+    if (!matrix->isTransposed()) {
+      sub = matrix->subMatrix(startRow, endRow, startCol, endCol);
+    } else {
+      sub = matrix->subMatrix(startCol, endCol, startRow, endRow);
+    }
+  };
+
+  int startM, endM;
+  int startN, endN;
+  int startK, endK;
+  subSize(startM, endM, dimM);
+  subSize(startN, endN, dimN);
+  subSize(startK, endK, dimK);
+
+  MatrixPtr subCpuA;
+  MatrixPtr subCpuB;
+  MatrixPtr subGpuA;
+  MatrixPtr subGpuB;
+  subMatrix(subCpuA, cpuA, startM, endM, startK, endK);
+  subMatrix(subGpuA, gpuA, startM, endM, startK, endK);
+  subMatrix(subCpuB, cpuB, startK, endK, startN, endN);
+  subMatrix(subGpuB, gpuB, startK, endK, startN, endN);
+  MatrixPtr subCpuC = cpuC->subMatrix(startM, endM, startN, endN);
+  MatrixPtr subGpuC = gpuC->subMatrix(startM, endM, startN, endN);
+
+  subCpuC->mul(*subCpuA, *subCpuB, alpha, beta);
+  subGpuC->mul(*subGpuA, *subGpuB, alpha, beta);
+
+  TensorCheckErr(*cpuC, *gpuC);
+}
+
+TEST(Matrix, mul) {
+  for (auto transa : {false, true}) {
+    for (auto transb : {false, true}) {
+      for (auto dimM : {1, 9, 53, 127, 345, 1023, 2135}) {
+        for (auto dimN : {1, 5, 37, 256, 1024}) {
+          for (auto dimK : {8, 45, 346, 784, 1025}) {
+            if (true == transa && true == transb) {
+              continue;
+            }
+            VLOG(3) << setiosflags(ios::left) << setfill(' ')
+                    << " transa=" << transa << " transb=" << transb
+                    << " dimM=" << setw(5) << dimM << " dimN=" << setw(5)
+                    << dimN << " dimK=" << setw(5) << dimK;
+
+            testMatrixMul(transa, transb, dimM, dimN, dimK);
+            testSubMatrixMul(transa, transb, dimM, dimN, dimK);
+          }
+        }
+      }
+    }
+  }
+}
+
+void testVectorRowFunc(int size) {
+  CpuVectorPtr cpu = std::make_shared<CpuVectorT<real>>(size);
+  GpuVectorPtr gpu = std::make_shared<GpuVectorT<real>>(size);
+
+  cpu->rand();
+  gpu->copyFrom(*cpu);
+
+  EXPECT_EQ(cpu->getMax(), gpu->getMax());
+  EXPECT_EQ(cpu->getMin(), gpu->getMin());
+  EXPECT_EQ(cpu->getAbsMax(), gpu->getAbsMax());
+}
+
+TEST(Vector, rowFunc) {
+  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
+    VLOG(3) << " size=" << size;
+    testVectorRowFunc(size);
+  }
+}
+
+template <class T>
+void testVectorReset(int size) {
+  std::shared_ptr<CpuVectorT<T>> cpu = std::make_shared<CpuVectorT<T>>(size);
+  std::shared_ptr<GpuVectorT<T>> gpu = std::make_shared<GpuVectorT<T>>(size);
+
+  // the "+ 1" in the divisor avoids a division by zero when rand() % 100 == 0
+  T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100 + 1));
+  cpu->reset(value);
+  gpu->reset(value);
+
+  TensorCheckEqual(*cpu, *gpu);
+}
+
+template <class T>
+void testVectorSelectFrom(int size) {
+  std::shared_ptr<CpuVectorT<T>> cpuDst = std::make_shared<CpuVectorT<T>>(size);
+  std::shared_ptr<GpuVectorT<T>> gpuDst = std::make_shared<GpuVectorT<T>>(size);
+  std::shared_ptr<CpuVectorT<T>> cpuSrc =
+      std::make_shared<CpuVectorT<T>>(size * 2);
+  std::shared_ptr<GpuVectorT<T>> gpuSrc =
+      std::make_shared<GpuVectorT<T>>(size * 2);
+  CpuIVectorPtr cpuIds = std::make_shared<CpuVectorT<int>>(size);
+  GpuIVectorPtr gpuIds = std::make_shared<GpuVectorT<int>>(size);
+
+  if (std::is_same<T, real>::value) {
+    cpuSrc->rand();
+  } else {
+    cpuSrc->rand(100000);
+  }
+  gpuSrc->copyFrom(*cpuSrc);
+  cpuIds->rand(size);
+  gpuIds->copyFrom(*cpuIds);
+
+  cpuDst->selectFrom(*cpuSrc, *cpuIds);
+  gpuDst->selectFrom(*gpuSrc, *gpuIds);
+
+  TensorCheckEqual(*cpuDst, *gpuDst);
+}
+
+template <class T>
+void testVectorZeroMem(int size) {
+  std::shared_ptr<CpuVectorT<T>> cpu = std::make_shared<CpuVectorT<T>>(size);
+  std::shared_ptr<GpuVectorT<T>> gpu = std::make_shared<GpuVectorT<T>>(size);
+
+  cpu->zeroMem();
+  gpu->zeroMem();
+
+  TensorCheckEqual(*cpu, *gpu);
+}
+
+template <class T>
+void testVectorIsEqual(int size) {
+  std::shared_ptr<CpuVectorT<T>> cpuA = std::make_shared<CpuVectorT<T>>(size);
+  std::shared_ptr<CpuVectorT<T>> cpuB = std::make_shared<CpuVectorT<T>>(size);
+  std::shared_ptr<GpuVectorT<T>> gpuA = std::make_shared<GpuVectorT<T>>(size);
+  std::shared_ptr<GpuVectorT<T>> gpuB = std::make_shared<GpuVectorT<T>>(size);
+
+  if (std::is_same<T, real>::value) {
+    cpuB->rand();
+  } else {
+    cpuB->rand(100000);
+  }
+  gpuB->copyFrom(*cpuB);
+
+  // the "+ 1" in the divisor avoids a division by zero when rand() % 100 == 0
+  T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100 + 1));
+  cpuA->isEqualTo(*cpuB, value);
+  gpuA->isEqualTo(*gpuB, value);
+
+  TensorCheckEqual(*cpuA, *gpuA);
+}
+
+TEST(Vector, Equal) {
+  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
+    VLOG(3) << " size=" << size;
+    testVectorReset<int>(size);
+    testVectorReset<real>(size);
+    testVectorSelectFrom<int>(size);
+    testVectorSelectFrom<real>(size);
+    testVectorZeroMem<int>(size);
+    testVectorZeroMem<real>(size);
+    testVectorIsEqual<int>(size);
+    testVectorIsEqual<real>(size);
+  }
+}
+
+void testMatrixTopK(int samples, int dim, int beamSize) {
+  MatrixPtr cpuSrc = std::make_shared<CpuMatrix>(samples, dim);
+  MatrixPtr gpuSrc = std::make_shared<GpuMatrix>(samples, dim);
+  MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
+  MatrixPtr gpuVal = std::make_shared<GpuMatrix>(samples, beamSize);
+  IVectorPtr cpuIds = std::make_shared<CpuIVector>(samples * beamSize);
+  IVectorPtr gpuIds = std::make_shared<GpuIVector>(samples * beamSize);
+
+  cpuSrc->randomizeUniform();
+  gpuSrc->copyFrom(*cpuSrc);
+
+  cpuSrc->rowMax(*cpuIds, *cpuVal);
+  gpuSrc->rowMax(*gpuIds, *gpuVal);
+
+  TensorCheckEqual(*cpuVal, *gpuVal);
+}
+
+TEST(Matrix, topK) {
+  for (auto samples : {1, 17, 131}) {  // prime numbers close to 1, 16, 127
+    for (auto dim : {1, 3, 997}) {     // prime numbers close to 1, 4, 1024
+      for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
+        if (beamSize > dim) continue;
+        VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
+                << " dim=" << dim;
+        testMatrixTopK(samples, dim, beamSize);
+      }
+    }
+  }
+}
+
+void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
+  int nnz = samples * dim * ratio;
+  if (nnz < 1) nnz = 1;  // sparseRand in MathUtil.cpp requires nnz >= 1.
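+  // For example, with samples=1, dim=61, ratio=0.001 (values exercised by the
+  // TEST below), samples * dim * ratio = 0.061, which truncates to nnz = 0 in
+  // the int conversion, so the clamp above is what keeps sparseRand valid.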
+  MatrixPtr cpuSrc = std::make_shared<CpuSparseMatrix>(samples, dim, nnz);
+  MatrixPtr gpuSrc = std::make_shared<GpuSparseMatrix>(samples, dim, nnz);
+  MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
+  MatrixPtr gpuVal = std::make_shared<GpuMatrix>(samples, beamSize);
+  IVectorPtr cpuIds = std::make_shared<CpuIVector>(samples * beamSize);
+  IVectorPtr gpuIds = std::make_shared<GpuIVector>(samples * beamSize);
+
+  cpuSrc->randomizeUniform();
+  gpuSrc->copyFrom(*cpuSrc);
+  cpuVal->zero();
+  cpuIds->zero();
+  gpuVal->zero();
+  gpuIds->zero();
+
+  cpuSrc->rowMax(*cpuIds, *cpuVal);
+  gpuSrc->rowMax(*gpuIds, *gpuVal);
+
+  TensorCheckEqual(*cpuVal, *gpuVal);
+
+  IVectorPtr outCheckIds = std::make_shared<CpuIVector>(samples * beamSize);
+  outCheckIds->copyFrom(*gpuIds);
+
+  // Padding positions (id == -1, used when nnz < beamSize) must match exactly
+  // between CPU and GPU; other positions may differ when values tie.
+  const int* data1 = cpuIds->getData();
+  const int* data2 = outCheckIds->getData();
+  size_t size = cpuIds->getSize();
+  for (size_t i = 0; i < size; i++) {
+    if (data1[i] == -1 && data1[i] != data2[i]) {
+      EXPECT_EQ(data1[i], data2[i]);
+    }
+  }
+}
+
+TEST(SMatrix, topK) {
+  for (auto samples : {1, 3, 61}) {
+    for (auto dim : {1, 3, 61}) {
+      for (auto beamSize : {1, 3, 61}) {
+        for (auto ratio : {0.01, 0.001}) {
+          if (beamSize > dim) continue;
+          VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
+                  << " dim=" << dim << " ratio=" << ratio;
+          testSMatrixTopK(samples, dim, beamSize, ratio);
+        }
+      }
+    }
+  }
+}
+
+void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  int newBatchSize = cpuSequence->getSize() - 1;
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
+  cpuOutput->zero();
+  gpuOutput->zero();
+
+  cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode);
+  gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
+
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+
+  MatrixPtr cpuInGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInGrad->randomizeUniform();
+  gpuInGrad->copyFrom(*cpuInGrad);
+
+  cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode);
+  gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode);
+
+  TensorCheckErr(*cpuInGrad, *gpuInGrad);
+}
+
+TEST(Matrix, sequenceAvg) {
+  for (auto batchSize : {10, 128, 6000}) {
+    for (auto inputDim : {32, 100, 512}) {
+      for (auto mode : {0, 1, 2}) {
+        VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim
+                << " mode=" << mode;
+        testMatrixSequenceAvg(batchSize, inputDim, mode);
+      }
+    }
+  }
+}
+
+void testParamReluBackwardDiff(int height,
+                               int width,
+                               int w_height,
+                               int w_width) {
+  MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
+  MatrixPtr input = CpuMatrix::create(height, width, false, false);
+  MatrixPtr diff = CpuMatrix::create(height, width, false, false);
+  MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
+
+  oGrad->randomizeUniform();
+  input->randomizeUniform();
+  w->randomizeUniform();
+  diff->randomizeUniform();
+  input->add(-0.5);
+
+  MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true);
+  MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
+  MatrixPtr diffGpu = GpuMatrix::create(height, width, false, true);
+  MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
+
+  oGradGpu->copyFrom(*oGrad);
+  inputGpu->copyFrom(*input);
+  wGpu->copyFrom(*w);
+  diffGpu->copyFrom(*diff);
+
+  diff->paramReluBackwardDiff(*oGrad, *input, *w);
+  diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu);
+
+  TensorCheckErr(*diff, *diffGpu);
+}
+
+TEST(Matrix, paramReluBackwardDiff) {
+  for (auto height : {10, 40, 100}) {
+    for (auto width : {10, 40, 100}) {
+      for (auto w_height : {1, 2}) {
+        for (auto w_width : {1, 2}) {
+          if (width % (w_height * w_width)) continue;
+          testParamReluBackwardDiff(height, width, w_height, w_width);
+        }
+      }
+    }
+  }
+}
+
+void testClassificationError(int numSamples, int dim, int topkSize) {
+  MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
+  MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
+  IVectorPtr cpuLabel = std::make_shared<CpuIVector>(numSamples);
+  IVectorPtr gpuLabel = std::make_shared<GpuIVector>(numSamples);
+
+  cpuOutput->randomizeUniform();
+  cpuLabel->rand(dim);
+  gpuOutput->copyFrom(*cpuOutput);
+  gpuLabel->copyFrom(*cpuLabel);
+
+  cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize);
+  gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize);
+
+  TensorCheckEqual(*cpuError, *gpuError);
+}
+
+TEST(Matrix, classificationError) {
+  for (auto numSamples : {1, 3, 31}) {
+    for (auto dim : {1, 3, 31}) {
+      for (auto topkSize : {1, 3, (int)rand() % dim + 1}) {
+        if (topkSize > dim) continue;
+        VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize
+                << " dim= " << dim;
+        testClassificationError(numSamples, dim, topkSize);
+      }
+    }
+  }
+}
+
+void testMaxPoolFwdBwd(int numSamples,
+                       int channels,
+                       int imgSizeH,
+                       int imgSizeW,
+                       int ksizeH,
+                       int ksizeW,
+                       int strideH,
+                       int strideW,
+                       int padH,
+                       int padW) {
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = imgSizeH * imgSizeW * channels;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->maxPoolForward(*input,
+                         imgSizeH,
+                         imgSizeW,
+                         channels,
+                         ksizeW,
+                         ksizeH,
+                         strideH,
+                         strideW,
+                         outH,
+                         outW,
+                         padH,
+                         padW);
+  targetGpu->maxPoolForward(*inputGpu,
+                            imgSizeH,
+                            imgSizeW,
+                            channels,
+                            ksizeW,
+                            ksizeH,
+                            strideH,
+                            strideW,
+                            outH,
+                            outW,
+                            padH,
+                            padW);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  targetCheck->copyFrom(*targetGpu);
+  checkMatrixEqual(target, targetCheck);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->maxPoolBackward(*input,
+                             imgSizeH,
+                             imgSizeW,
+ *targetGrad, + *target, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + inputGpuGrad->maxPoolBackward(*inputGpu, + imgSizeH, + imgSizeW, + *targetGpuGrad, + *targetGpu, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + MatrixPtr targetBwdCheck = + CpuMatrix::create(numSamples, inWidth, false, false); + targetBwdCheck->copyFrom(*inputGpuGrad); + checkMatrixEqual(inputGrad, targetBwdCheck); +} + +void testAvgPoolFwdBwd(int numSamples, + int channels, + int imgSizeH, + int imgSizeW, + int ksizeH, + int ksizeW, + int strideH, + int strideW, + int padH, + int padW) { + int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); + int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); + + int inWidth = imgSizeH * imgSizeW * channels; + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + int outWidth = channels * outH * outW; + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + + input->randomizeUniform(); + target->randomizeUniform(); + inputGpu->copyFrom(*input); + targetGpu->copyFrom(*target); + + target->avgPoolForward(*input, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); + targetGpu->avgPoolForward(*inputGpu, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); + + TensorCheckErr(*target, *targetGpu); + + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->avgPoolBackward(*targetGrad, + imgSizeH, + imgSizeW, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + inputGpuGrad->avgPoolBackward(*targetGpuGrad, + imgSizeH, + imgSizeW, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + + TensorCheckErr(*inputGrad, *inputGpuGrad); +} + +// TODO(yi): I noticed many such blindly combinatorial tests in this +// file. They are no help to locate defects at all. 
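+// A quick sanity check on the geometry used by these tests: with the final
+// bool of outputSize() set to true, the pooled size should follow the usual
+// caffe-style convention (imgSize - ksize + 2 * pad) / stride + 1, so e.g.
+// imgSizeH=13, ksizeH=3, padH=1, strideH=2 gives outH = (13 - 3 + 2) / 2 + 1
+// = 7. (This assumes the common convention; see outputSize() for the exact
+// definition.)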
+TEST(Matrix, PoolFwdBwd) {
+  for (auto numSamples : {1, 3}) {
+    for (auto channels : {1, 3}) {
+      for (auto imgSizeH : {13, 17}) {
+        for (auto imgSizeW : {17, 19}) {
+          for (auto sizeX : {2, 3}) {
+            for (auto sizeY : {2, 3}) {
+              for (auto sH : {1, 2}) {
+                for (auto sW : {1, 2}) {
+                  for (auto pH : {0, (sizeY - 1) / 2}) {
+                    for (auto pW : {0, (sizeX - 1) / 2}) {
+                      VLOG(3) << " numSamples=" << numSamples
+                              << " channels=" << channels
+                              << " imgSizeH=" << imgSizeH
+                              << " imgSizeW=" << imgSizeW << " sizeX=" << sizeX
+                              << " sizeY=" << sizeY << " strideH=" << sH
+                              << " strideW=" << sW << " paddingH=" << pH
+                              << " paddingW=" << pW;
+                      testMaxPoolFwdBwd(numSamples,
+                                        channels,
+                                        imgSizeH,
+                                        imgSizeW,
+                                        sizeX,
+                                        sizeY,
+                                        sH,
+                                        sW,
+                                        pH,
+                                        pW);
+                      testAvgPoolFwdBwd(numSamples,
+                                        channels,
+                                        imgSizeH,
+                                        imgSizeW,
+                                        sizeX,
+                                        sizeY,
+                                        sH,
+                                        sW,
+                                        pH,
+                                        pW);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void testMaxOutFwdBwd(
+    int numSamples, int imgSizeH, int imgSizeW, int channels, int groups) {
+  int inWidth = imgSizeH * imgSizeW * channels;
+  int outChannels = channels / groups;
+  int outWidth = imgSizeH * imgSizeW * outChannels;
+
+  // forward
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
+  IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
+
+  input->randomizeUniform();
+  inputGpu->copyFrom(*input);
+
+  target->maxoutForward(*input, *id, outChannels, groups);
+  targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
+
+  TensorCheckErr(*target, *targetGpu);
+  TensorCheckEqual(*id, *idGpu);
+
+  // backward
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
+  inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
+
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
+}
+
+TEST(Matrix, MaxOutFwdBwd) {
+  for (auto numSamples : {5, 10}) {
+    for (auto channels : {8, 16}) {
+      for (auto imgSizeH : {14, 28}) {
+        for (auto imgSizeW : {16, 30}) {
+          for (auto groups : {2, 4}) {
+            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
+                    << " groups=" << groups;
+            testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(CpuMatrix, copyFrom) {
+  const size_t height = 31;
+  const size_t width = 53;
+  CpuMatrix cpu(height, width);
+  GpuMatrix gpu(height, width);
+  CpuMatrix copy(height, width);
+
+  cpu.randomizeUniform();
+  gpu.copyFrom(cpu);
+  copy.copyFrom(gpu, HPPL_STREAM_DEFAULT);
+
+  TensorCheckEqual(cpu, copy);
+}
+
+void testBatch2seqPadding(int batchSize, int inputDim) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  for (int i = 0; i < int(cpuSequence->getSize()); ++i) {
+    (cpuSequence->getData())[i] += 1;  // ensure maxSeqLen is never 0
+  }
+
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  size_t numSeq = cpuSequence->getSize() - 1;
+  size_t maxSeqLen = *std::max_element(cpuSequence->getData(),
+                                       cpuSequence->getData() + numSeq);
+
+  printf("numSeq = %zu, maxSeqLen = %zu\n", numSeq, maxSeqLen);
+  MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+
+  // hl_sequence2batch_copy_padding(gBatch->getData(),
+  //                                gpuInput->getData(),
+  //                                cpuSequence->getData(),
+  //                                inputDim,
+  //                                maxSeqLen,
+  //                                numSeq,
+  //                                false,
+  //                                true);
+  // cCheck->copyFrom(*gBatch);
+
+  // int* seqStart = cpuSequence->getData();
+  // float* batchData = cBatch->getData();
+  // float* seqData = cpuInput->getData();
+  // for (size_t i = 0; i < maxSeqLen; i++) {
+  //   for (size_t j = 0; j < numSeq; j++) {
+  //     size_t sequenceStart = seqStart[j];
+  //     size_t sequenceLength = seqStart[j + 1] - seqStart[j];
+  //     if (i < sequenceLength) {
+  //       memcpy(batchData + (i * numSeq + j) * inputDim,
+  //              seqData + (sequenceStart + i) * inputDim,
+  //              inputDim * sizeof(real));
+  //     } else {
+  //       memset(batchData + (i * numSeq + j) * inputDim,
+  //              0,
+  //              inputDim * sizeof(real));
+  //     }
+  //   }
+  // }
+
+  // TensorCheckErr(*cBatch, *cCheck);
+}
+
+TEST(Matrix, warpCTC) {
+  for (auto batchSize : {1, 3, 17}) {
+    for (auto inputDim : {1, 3, 31}) {
+      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
+      testBatch2seqPadding(batchSize, inputDim);
+    }
+  }
+}
+
+void testMaxPool3DFwdBwd(int numSamples,
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = channels * imgSizeD * imgSizeH * imgSizeW;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->maxPool3DForward(*input,
+                           *maxIdx,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+  targetGpu->maxPool3DForward(*inputGpu,
+                              *maxIdxGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
+                              outD,
+                              outH,
+                              outW,
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  targetCheck->copyFrom(*targetGpu);
+  checkMatrixEqual(target, targetCheck);
+
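+  // Backward pass: maxPool3DBackward routes each output gradient back to the
+  // input position recorded in maxIdx during the forward pass, so the CPU and
+  // GPU gradients computed below should agree element for element.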
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->maxPool3DBackward(*targetGrad, + *maxIdx, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + inputGpuGrad->maxPool3DBackward(*targetGpuGrad, + *maxIdxGpu, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + MatrixPtr targetBwdCheck = + CpuMatrix::create(numSamples, inWidth, false, false); + targetBwdCheck->copyFrom(*inputGpuGrad); + checkMatrixEqual(inputGrad, targetBwdCheck); +} + +void testAvgPool3DFwdBwd(int numSamples, + int channels, + int imgSizeD, + int imgSizeH, + int imgSizeW, + int ksizeD, + int ksizeH, + int ksizeW, + int strideD, + int strideH, + int strideW, + int padD, + int padH, + int padW) { + int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true); + int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); + int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); + + int inWidth = imgSizeD * imgSizeH * imgSizeW * channels; + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + int outWidth = channels * outD * outH * outW; + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + + input->randomizeUniform(); + target->randomizeUniform(); + inputGpu->copyFrom(*input); + targetGpu->copyFrom(*target); + + target->avgPool3DForward(*input, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW); + + targetGpu->avgPool3DForward(*inputGpu, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW); + + TensorCheckErr(*target, *targetGpu); + + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->avgPool3DBackward(*targetGrad, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + + inputGpuGrad->avgPool3DBackward(*targetGpuGrad, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + TensorCheckErr(*inputGrad, *inputGpuGrad); +} + +// TODO(yi): I noticed many such blindly combinatorial tests in this +// file. 
They are no help to locate defects at all. +TEST(Matrix, Pool3DFwdBwd) { + for (auto numSamples : {1, 3}) { + for (auto channels : {3}) { + for (auto imgSizeD : {9, 16}) { + for (auto imgSizeH : {9, 32}) { + for (auto imgSizeW : {9, 32}) { + for (auto sizeX : {3}) { + for (auto sizeY : {3}) { + for (auto sizeZ : {3}) { + for (auto sD : {2}) { + for (auto sH : {2}) { + for (auto sW : {2}) { + for (auto pD : {0, (sizeZ - 1) / 2}) { + for (auto pH : {0, (sizeY - 1) / 2}) { + for (auto pW : {0, (sizeX - 1) / 2}) { + VLOG(3) << " numSamples=" << numSamples + << " channels=" << channels + << " imgSizeD=" << imgSizeD + << " imgSizeH=" << imgSizeH + << " imgSizeW=" << imgSizeW + << " sizeX=" << sizeX + << " sizeY=" << sizeY + << " sizeZ=" << sizeZ << " strideD=" << sD + << " strideH=" << sH << " strideW=" << sW + << " padingD=" << pD << " padingH=" << pH + << " padingW=" << pW; + + testMaxPool3DFwdBwd(numSamples, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sizeZ, + sD, + sH, + sW, + pD, + pH, + pW); + testAvgPool3DFwdBwd(numSamples, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sizeZ, + sD, + sH, + sW, + pD, + pH, + pW); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + + // for (auto numSamples : {1, 3}) { + // for (auto channels : {1, 3}) { + // for (auto imgSizeD : {9,16}) { + // for (auto imgSizeH : {9, 32}) { + // for (auto imgSizeW : {9, 32}) { + // for (auto sizeX : {2, 3}) { + // for (auto sizeY : {2, 3}) { + // for (auto sizeZ : {2,3}){ + // for (auto sD : {1, 2}) { + // for (auto sH : {1, 2}) { + // for (auto sW : {1, 2}) { + // for (auto pD : {0, (sizeZ - 1) / 2}){ + // for (auto pH : {0, (sizeY - 1) / 2}) { + // for (auto pW : {0, (sizeX - 1) / 2}) { + // VLOG(3) << " numSamples=" << numSamples + // << " channels=" << channels + // << " imgSizeD=" << imgSizeD + // << " imgSizeH=" << imgSizeH + // << " imgSizeW=" << imgSizeW + // << " sizeX=" << sizeX + // << " sizeY=" << sizeY + // << " sizeZ=" << sizeZ + // << " strideD=" << sD + // << " strideH=" << sH + // << " strideW=" << sW + // << " padingD=" << pD + // << " padingH=" << pH + // << " padingW=" << pW; + // + // testMaxPool3DFwdBwd(numSamples, + // channels, + // imgSizeD, + // imgSizeH, + // imgSizeW, + // sizeX, + // sizeY, + // sizeZ, + // sD, + // sH, + // sW, + // pD, + // pH, + // pW); + // testAvgPool3DFwdBwd(numSamples, + // channels, + // imgSizeD, + // imgSizeH, + // imgSizeW, + // sizeX, + // sizeY, + // sizeZ, + // sD, + // sH, + // sW, + // pD, + // pH, + // pW); + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } +} + +void testMatrixCol2Vol(int depth, int height, int width) { + int channel = 3; + int filterX = 3, filterY = 4, filterZ = 5; + int strideX = 2, strideY = 2, strideZ = 2; + int padX = 1, padY = 1, padZ = 1; + + MatrixPtr cpuImage = + std::make_shared(channel, depth * height * width); + MatrixPtr gpuImage = + std::make_shared(channel, depth * height * width); + cpuImage->randomizeUniform(); + gpuImage->copyFrom(*cpuImage); + + int outD = outputSize(depth, filterZ, padZ, strideZ, true); + int outH = outputSize(height, filterY, padY, strideY, true); + int outW = outputSize(width, filterX, padX, strideX, true); + + int colBufHeight = channel * filterZ * filterY * filterX; + int colBufWidth = outD * outH * outW; + MatrixPtr cpuColBuf = std::make_shared(colBufHeight, colBufWidth); + MatrixPtr gpuColBuf = std::make_shared(colBufHeight, colBufWidth); + cpuColBuf->vol2Col(cpuImage->getData(), + channel, + depth, + 
height, + width, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + padZ, + padY, + padX); + gpuColBuf->vol2Col(gpuImage->getData(), + channel, + depth, + height, + width, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + padZ, + padY, + padX); + TensorCheckEqual(*cpuColBuf, *gpuColBuf); + + cpuColBuf->randomizeUniform(); + gpuColBuf->copyFrom(*cpuColBuf); + cpuColBuf->col2Vol(cpuImage->getData(), + channel, + depth, + height, + width, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + padZ, + padY, + padX, + 1.0, + 1.0); + gpuColBuf->col2Vol(gpuImage->getData(), + channel, + depth, + height, + width, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + padZ, + padY, + padX, + 1.0, + 1.0); + TensorCheckErr(*cpuImage, *gpuImage); +} + +TEST(Matrix, col2Vol) { + for (auto depth : {9, 16, 64}) { + for (auto height : {9, 11, 128}) { + for (auto width : {9, 32, 128}) { + VLOG(3) << "depth=" << depth << " height=" << height + << " width=" << width; + testMatrixCol2Vol(depth, height, width); + } + } + } +} + +#endif diff --git a/paddle/legacy/math/tests/test_matrixUtil.h b/paddle/legacy/math/tests/test_matrixUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..bb80172b1e02e6927d15d648f18ddfa3bcbab596 --- /dev/null +++ b/paddle/legacy/math/tests/test_matrixUtil.h @@ -0,0 +1,233 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { + +void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) { + ASSERT_EQ(a->getWidth(), b->getWidth()); + ASSERT_EQ(a->getHeight(), b->getHeight()); + ASSERT_EQ(a->isTransposed(), b->isTransposed()); + for (size_t r = 0; r < a->getHeight(); ++r) { + for (size_t c = 0; c < a->getWidth(); ++c) { + ASSERT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c)); + } + } +} + +void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) { + ASSERT_EQ(a.getWidth(), b.getWidth()); + ASSERT_EQ(a.getHeight(), b.getHeight()); + ASSERT_EQ(a.isTransposed(), b.isTransposed()); + ASSERT_EQ(a.getFormat(), b.getFormat()); + ASSERT_EQ(a.getElementCnt(), b.getElementCnt()); + for (size_t r = 0; r < a.getElementCnt(); ++r) { + ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]); + } +} + +void checkSMatrixEqual(const CpuSparseMatrixPtr& a, + const CpuSparseMatrixPtr& b) { + ASSERT_EQ(a->getWidth(), b->getWidth()); + ASSERT_EQ(a->getHeight(), b->getHeight()); + ASSERT_EQ(a->isTransposed(), b->isTransposed()); + ASSERT_EQ(a->getFormat(), b->getFormat()); + ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); + for (size_t r = 0; r < a->getElementCnt(); ++r) { + ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); + } +} + +void checkSMatrixEqual2(const CpuSparseMatrixPtr& a, + const CpuSparseMatrixPtr& b) { + ASSERT_EQ(a->getWidth(), b->getWidth()); + ASSERT_EQ(a->getHeight(), b->getHeight()); + ASSERT_EQ(a->isTransposed(), b->isTransposed()); + ASSERT_EQ(a->getFormat(), b->getFormat()); + ASSERT_EQ(a->getValueType(), b->getValueType()); + ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); + if (a->getFormat() == SPARSE_CSR) { + for (size_t r = 0; r < a->getElementCnt(); ++r) { + ASSERT_EQ(a->getCols()[r], b->getCols()[r]); + if (a->getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); + } + } + for (size_t r = 0; r <= a->getHeight(); r++) { + ASSERT_EQ(a->getRows()[r], b->getRows()[r]); + } + } else { + for (size_t r = 0; r < a->getElementCnt(); ++r) { + ASSERT_EQ(a->getRows()[r], b->getRows()[r]); + if (a->getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); + } + } + for (size_t r = 0; r <= a->getWidth(); r++) { + ASSERT_EQ(a->getCols()[r], b->getCols()[r]); + } + } +} + +void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) { + ASSERT_EQ(a.getWidth(), b.getWidth()); + ASSERT_EQ(a.getHeight(), b.getHeight()); + ASSERT_EQ(a.isTransposed(), b.isTransposed()); + + if (a.getFormat() == SPARSE_CSC) { + int* rows = a.getRows(); + for (size_t i = 0; i < a.getWidth(); i++) { + for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) { + if (a.getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i)); + } else { + ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i)); + } + } + } + } else { + int* cols = a.getCols(); + for (size_t i = 0; i < a.getHeight(); i++) { + for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) { + if (a.getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j])); + } else { + ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j])); + } + } + } + } +} + +void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a, + const CpuMatrixPtr& b) { + ASSERT_EQ(a->getWidth(), b->getWidth()); + ASSERT_EQ(a->getHeight(), b->getHeight()); + ASSERT_EQ(a->isTransposed(), b->isTransposed()); + + if (a->getFormat() == 
SPARSE_CSC) { + int* rows = a->getRows(); + for (size_t i = 0; i < a->getWidth(); i++) { + for (size_t j = a->getColStartIdx(i); j < a->getColStartIdx(i + 1); j++) { + if (a->getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(rows[j], i)); + } else { + ASSERT_FLOAT_EQ(1.0, b->getElement(rows[j], i)); + } + } + } + } else { + int* cols = a->getCols(); + for (size_t i = 0; i < a->getHeight(); i++) { + for (size_t j = a->getRowStartIdx(i); j < a->getRowStartIdx(i + 1); j++) { + if (a->getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(i, cols[j])); + } else { + ASSERT_FLOAT_EQ(1.0, b->getElement(i, cols[j])); + } + } + } + } +} + +void checkSMatrixErr(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) { +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + ASSERT_EQ(a->getWidth(), b->getWidth()); + ASSERT_EQ(a->getHeight(), b->getHeight()); + ASSERT_EQ(a->isTransposed(), b->isTransposed()); + ASSERT_EQ(a->getFormat(), b->getFormat()); + ASSERT_EQ(a->getValueType(), b->getValueType()); + ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); + int count = 0; + if (a->getFormat() == SPARSE_CSR) { + for (size_t r = 0; r < a->getElementCnt(); ++r) { + ASSERT_EQ(a->getCols()[r], b->getCols()[r]); + if (a->getValueType() == FLOAT_VALUE) { + real aVal = a->getValue()[r]; + real bVal = b->getValue()[r]; + if (std::abs(aVal - bVal) > err) { + if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { + LOG(INFO) << "a=" << aVal << "\t" + << "b=" << bVal; + count++; + } + } + } + } + for (size_t r = 0; r <= a->getHeight(); r++) { + ASSERT_EQ(a->getRows()[r], b->getRows()[r]); + } + } else { + for (size_t r = 0; r < a->getElementCnt(); ++r) { + ASSERT_EQ(a->getRows()[r], b->getRows()[r]); + if (a->getValueType() == FLOAT_VALUE) { + real aVal = a->getValue()[r]; + real bVal = b->getValue()[r]; + if (std::abs(aVal - bVal) > err) { + if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { + count++; + } + } + } + } + for (size_t r = 0; r <= a->getWidth(); r++) { + ASSERT_EQ(a->getCols()[r], b->getCols()[r]); + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + real a = data1[i * width + j]; + real b = data2[i * width + j]; + if (std::abs(a - b) > err) { + if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) { + count++; + } + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void checkDataEqual(const real* a, const real* b, size_t size) { + for (size_t i = 0; i < size; ++i) { + ASSERT_FLOAT_EQ(a[i], b[i]); + } +} + +} // namespace paddle diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/legacy/math/tests/test_perturbation.cpp similarity index 100% rename from paddle/math/tests/test_perturbation.cpp rename to paddle/legacy/math/tests/test_perturbation.cpp diff --git a/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp b/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..959c9d40b0ec67a8ae8822da7a96a0541370b956
--- /dev/null
+++ b/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+/// This unittest checks that GpuSparseMatrix and CpuSparseMatrix produce the
+/// same result, so it is disabled when only the CPU version is built.
+
+#include <gtest/gtest.h>
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/utils/Util.h"
+#include "test_matrixUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static inline int uniformRandom(int n) { return n == 0 ? 0 : rand() % n; }
+
+void testSpMatrixAddBias(int M, int N, real rate, real scale) {
+  int nnz = M * N * rate;
+
+  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
+
+  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_1);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  hl_stream_synchronize(stream);
+
+  cpuA->addBias(*cpuB, scale);
+  gpuA->addBias(*gpuB, scale);
+
+  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
+  outputCheck->copyFrom(*gpuA, stream);
+  hl_stream_synchronize(stream);
+  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
+                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
+}
+
+void testSpMatrixAddDense(int M, int N, real rate) {  // add3
+  int nnz = M * N * rate;
+
+  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(M, N);
+
+  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(M, N);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_3);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  hl_stream_synchronize(stream);
+
+  cpuA->add3(cpuB);
+  gpuA->add3(gpuB);
+
+  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
+  outputCheck->copyFrom(*gpuA, stream);
+  hl_stream_synchronize(stream);
+  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
+                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
+}
+
+void testSpMatrixMul(int M, int N, int K, real rate) {
+  int nnz = M * N * rate;
+
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
+  MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
+
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
+  MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuC->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_3);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  gpuC->copyFrom(*cpuC, stream);
+  hl_stream_synchronize(stream);
+
+  cpuC->mul(*cpuA, *cpuB->getTranspose(), 1, 1);
+  gpuC->mul(*gpuA, *gpuB->getTranspose(), 1, 1);
+
+  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
+  outputCheck->copyFrom(*gpuC, stream);
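+  // Note: copyFrom() on HPPL_STREAM_3 is asynchronous; the synchronize below
+  // must complete before the CPU-side comparison reads outputCheck.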
+  hl_stream_synchronize(stream);
+  checkSMatrixErr(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuC),
+                  std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
+}
+
+void testSpMatrixCollectBias(int M, int N, real rate) {
+  int nnz = M * N * rate;
+  LOG(INFO) << "nnz=" << nnz;
+
+  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
+
+  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_3);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  hl_stream_synchronize(stream);
+
+  cpuB->collectBias(*cpuA, 1);
+  gpuB->collectBias(*gpuA, 1);
+
+  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, N);
+  outputCheck->copyFrom(*gpuB, stream);
+  hl_stream_synchronize(stream);
+  checkMatrixErr(*cpuB, *outputCheck);
+}
+
+TEST(SMatrix, sMatrixOp) {
+  for (auto height : {1, 11, 200}) {
+    for (auto width : {200, 2048, 20480}) {
+      VLOG(3) << " height=" << height << " width=" << width;
+      for (auto rate : {0.02, 0.1}) {
+        testSpMatrixAddDense(height, width, rate);
+        testSpMatrixAddBias(height, width, rate, 1.0);
+      }
+    }
+  }
+}
+
+TEST(SMatrix, sMatrixMul) {
+  for (auto M : {1, 40, 128, 200}) {
+    for (auto N : {100, 2000, 20480}) {
+      for (auto K : {100, 512, 1024}) {
+        VLOG(3) << " M=" << M << " N=" << N << " K=" << K;
+        testSpMatrixMul(M, N, K, 0.05);
+      }
+    }
+  }
+}
+
+TEST(SMatrix, sMatrixCollectBias) {
+  for (auto height : {1, 128, 200}) {
+    for (auto width : {100, 2048, 20480}) {
+      VLOG(3) << " height=" << height << " width=" << width;
+      testSpMatrixCollectBias(height, width, 0.1);
+    }
+  }
+}
+
+#endif
diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/legacy/optimizer/CMakeLists.txt
similarity index 100%
rename from paddle/optimizer/CMakeLists.txt
rename to paddle/legacy/optimizer/CMakeLists.txt
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/legacy/optimizer/adadelta_optimizer.cc
similarity index 100%
rename from paddle/optimizer/adadelta_optimizer.cc
rename to paddle/legacy/optimizer/adadelta_optimizer.cc
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/legacy/optimizer/adadelta_optimizer.h
similarity index 100%
rename from paddle/optimizer/adadelta_optimizer.h
rename to paddle/legacy/optimizer/adadelta_optimizer.h
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/legacy/optimizer/adagrad_optimizer.cc
similarity index 100%
rename from paddle/optimizer/adagrad_optimizer.cc
rename to paddle/legacy/optimizer/adagrad_optimizer.cc
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/legacy/optimizer/adagrad_optimizer.h
similarity index 100%
rename from paddle/optimizer/adagrad_optimizer.h
rename to paddle/legacy/optimizer/adagrad_optimizer.h
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/legacy/optimizer/adam_optimizer.cc
similarity index 100%
rename from paddle/optimizer/adam_optimizer.cc
rename to paddle/legacy/optimizer/adam_optimizer.cc
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/legacy/optimizer/adam_optimizer.h
similarity index 100%
rename from paddle/optimizer/adam_optimizer.h
rename to paddle/legacy/optimizer/adam_optimizer.h
diff --git a/paddle/optimizer/lr_policy.h b/paddle/legacy/optimizer/lr_policy.h
similarity index 100%
rename from paddle/optimizer/lr_policy.h
rename to paddle/legacy/optimizer/lr_policy.h
diff --git a/paddle/optimizer/optimizer.cc b/paddle/legacy/optimizer/optimizer.cc
similarity index 100%
rename from paddle/optimizer/optimizer.cc
rename to
paddle/legacy/optimizer/optimizer.cc diff --git a/paddle/optimizer/optimizer.h b/paddle/legacy/optimizer/optimizer.h similarity index 100% rename from paddle/optimizer/optimizer.h rename to paddle/legacy/optimizer/optimizer.h diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/legacy/optimizer/parameter_optimizer.cc similarity index 100% rename from paddle/optimizer/parameter_optimizer.cc rename to paddle/legacy/optimizer/parameter_optimizer.cc diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/legacy/optimizer/parameter_optimizer.h similarity index 100% rename from paddle/optimizer/parameter_optimizer.h rename to paddle/legacy/optimizer/parameter_optimizer.h diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/legacy/optimizer/parameter_optimizer_test.cc similarity index 100% rename from paddle/optimizer/parameter_optimizer_test.cc rename to paddle/legacy/optimizer/parameter_optimizer_test.cc diff --git a/paddle/optimizer/serialization.h b/paddle/legacy/optimizer/serialization.h similarity index 100% rename from paddle/optimizer/serialization.h rename to paddle/legacy/optimizer/serialization.h diff --git a/paddle/optimizer/serialization_test.cc b/paddle/legacy/optimizer/serialization_test.cc similarity index 100% rename from paddle/optimizer/serialization_test.cc rename to paddle/legacy/optimizer/serialization_test.cc diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/legacy/optimizer/sgd_optimizer.cc similarity index 100% rename from paddle/optimizer/sgd_optimizer.cc rename to paddle/legacy/optimizer/sgd_optimizer.cc diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/legacy/optimizer/sgd_optimizer.h similarity index 100% rename from paddle/optimizer/sgd_optimizer.h rename to paddle/legacy/optimizer/sgd_optimizer.h diff --git a/paddle/optimizer/tensor.h b/paddle/legacy/optimizer/tensor.h similarity index 100% rename from paddle/optimizer/tensor.h rename to paddle/legacy/optimizer/tensor.h diff --git a/paddle/legacy/parameter/Argument.cpp b/paddle/legacy/parameter/Argument.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f1d599e901110a1c9390d76c45f8b4b1f4cab2a --- /dev/null +++ b/paddle/legacy/parameter/Argument.cpp @@ -0,0 +1,707 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "Argument.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+
+#include <algorithm>
+
+namespace paddle {
+static void resizeAndCopy(MatrixPtr& dest,
+                          const MatrixPtr& src,
+                          bool useGpu,
+                          hl_stream_t stream) {
+  if (src) {
+    if (!dest) {
+      dest = src->clone(0, 0, useGpu);
+    } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
+      dest->resize(src->getHeight(), src->getWidth());
+    }
+    dest->copyFrom(*src, stream);
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(IVectorPtr& dest,
+                          const IVectorPtr& src,
+                          bool useGpu,
+                          hl_stream_t stream) {
+  if (src) {
+    IVector::resizeOrCreate(dest, src->getSize(), useGpu);
+    dest->copyFrom(*src, stream);
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(ICpuGpuVectorPtr& dest,
+                          const ICpuGpuVectorPtr& src,
+                          bool useGpu,
+                          hl_stream_t stream) {
+  if (src) {
+    ICpuGpuVector::resizeOrCreate(dest, src->getSize(), useGpu);
+    dest->copyFrom(*src, stream);
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(MatrixPtr& dest,
+                          const MatrixPtr& src,
+                          int32_t startRow,
+                          int32_t copySize,
+                          bool useGpu,
+                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
+  if (src) {
+    CHECK_LE((size_t)startRow + copySize, src->getHeight());
+    int height = copySize;
+    int width = src->getWidth();
+    if (!dest) {
+      dest = src->clone(height, width, useGpu);
+    } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
+      dest->resize(height, width);
+    }
+    MatrixPtr submat = src->subMatrix(startRow, copySize);
+    if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
+      // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
+      // First copy it to CPU, and then copy it to the GPU.
+      MatrixPtr tmp = src->clone(height, width, false);
+      tmp->copyFrom(*submat, stream);
+      dest->copyFrom(*tmp, stream);
+    } else {
+      dest->copyFrom(*submat, stream);
+    }
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(IVectorPtr& dest,
+                          const IVectorPtr& src,
+                          int32_t startPos,
+                          int32_t copySize,
+                          bool useGpu,
+                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
+  if (src) {
+    CHECK_LE((size_t)startPos + copySize, src->getSize());
+
+    int height = copySize;
+    IVector::resizeOrCreate(dest, height, useGpu);
+    dest->copyFrom(src->getData() + startPos, height, stream);
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(ICpuGpuVectorPtr& dest,
+                          const ICpuGpuVectorPtr& src,
+                          int32_t startPos,
+                          int32_t copySize,
+                          bool useGpu,
+                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
+  if (src) {
+    CHECK_LE((size_t)startPos + copySize, src->getSize());
+
+    ICpuGpuVector::resizeOrCreate(dest, copySize, useGpu);
+    dest->copyFrom(*src, startPos, copySize, useGpu, stream);
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(SVectorPtr& dest,
+                          const SVectorPtr& src,
+                          bool useGpu,
+                          hl_stream_t stream) {
+  if (src) {
+    size_t height = src->size();
+    if (!dest) {
+      dest = std::make_shared<std::vector<std::string>>(height);
+    } else {
+      dest->resize(height);
+    }
+    std::copy_n(src->begin(), height, dest->begin());
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(SVectorPtr& dest,
+                          const SVectorPtr& src,
+                          int32_t startPos,
+                          int32_t copySize,
+                          bool useGpu,
+                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
+  if (src) {
+    CHECK_LE((size_t)startPos + copySize, src->size());
+    size_t height = copySize;
+    if (!dest) {
+      dest = std::make_shared<std::vector<std::string>>(height);
+    } else {
+      dest->resize(height);
+    }
+    std::copy_n(src->begin() + startPos, height, dest->begin());
+  } else {
+    dest.reset();
+  }
+}
+
+void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
+  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
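+  // The stream overload above only enqueues asynchronous copies; the
+  // synchronize below is what makes this two-argument overload blocking.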
hl_stream_synchronize(HPPL_STREAM_DEFAULT); +} + +void Argument::resizeAndCopyFrom(const Argument& src, + bool useGpu, + hl_stream_t stream) { + dataId = src.dataId; + resizeAndCopy(value, src.value, useGpu, stream); + resizeAndCopy(grad, src.grad, useGpu, stream); + resizeAndCopy(in, src.in, useGpu, stream); + resizeAndCopy(ids, src.ids, useGpu, stream); + resizeAndCopy(sequenceStartPositions, + src.sequenceStartPositions, + false /* useGpu */, + stream); + if (src.hasSubseq()) { + resizeAndCopy(subSequenceStartPositions, + src.subSequenceStartPositions, + false /* useGpu */, + stream); + } + resizeAndCopy(strs, src.strs, useGpu, stream); + frameWidth = src.frameWidth; + frameHeight = src.frameHeight; + frameDepth = src.frameDepth; +} + +int32_t Argument::resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu) { + int32_t size = + resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + return size; +} + +int32_t Argument::resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu, + hl_stream_t stream) { + dataId = src.dataId; + frameWidth = src.frameWidth; + frameHeight = src.frameHeight; + frameDepth = src.frameDepth; + + if (!src.sequenceStartPositions) { + // non-sequence input, copy samples directly + int32_t startRow = startSeq; + resizeAndCopy(in, src.in, startRow, copySize, useGpu, stream); + resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream); + resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream); + resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream); + resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream); + return copySize; + } else { + // sequence input + const int* sequence = src.sequenceStartPositions->getData(false); + int32_t startRow = sequence[startSeq]; // sample start from here + int32_t endRow = sequence[startSeq + copySize]; // sample end + int32_t copyFeatureSize = endRow - startRow; // num of samples + resizeAndCopy(in, src.in, startRow, copyFeatureSize, useGpu, stream); + resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream); + resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream); + resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream); + resizeAndCopy(sequenceStartPositions, + src.sequenceStartPositions, + startSeq, + copySize + 1, + false, + stream); + // modify new sequenceStartPositions + int* destSequences = sequenceStartPositions->getMutableData(false); + for (int i = 0; i < copySize + 1; i++) { + destSequences[i] -= startRow; + } + CHECK_EQ(destSequences[0], 0); + CHECK_EQ(destSequences[copySize], copyFeatureSize); + if (src.hasSubseq()) { + // sequence has sub-sequence + int* subSequence = src.subSequenceStartPositions->getMutableData(false); + int32_t subStartSeq = 0; + int32_t subEndSeq = 0; + int numSubSequences = src.getNumSubSequences(); + for (int i = 0; i < numSubSequences + 1; i++) { + if (subSequence[i] == startRow) { + subStartSeq = i; + } else if (subSequence[i] == endRow) { + subEndSeq = i; + break; + } + } + int32_t copySubSize = subEndSeq - subStartSeq; + resizeAndCopy(subSequenceStartPositions, + src.subSequenceStartPositions, + subStartSeq, + copySubSize + 1, + false, + stream); + // modify new subSequenceStartPositions + int* destSubSequences = subSequenceStartPositions->getMutableData(false); + for (int i = 0; i < copySubSize + 1; i++) { + destSubSequences[i] -= startRow; + } + 
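+      // After rebasing by startRow, the copied sub-sequence offsets must
+      // start at 0 and end at the number of copied samples; the two checks
+      // below assert exactly that.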
CHECK_EQ(destSubSequences[0], 0); + CHECK_EQ(destSubSequences[copySubSize], copyFeatureSize); + } + resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream); + return copyFeatureSize; + } +} + +void Argument::concat(const std::vector<Argument>& args, + const std::vector<int>& selectRows, + const std::vector<int>& seqStartPos, + const std::vector<int>& copySize, + bool useGpu, + hl_stream_t stream, + PassType passType) { + CHECK(!subSequenceStartPositions) + << "undefined behavior for subsequence positions"; + + size_t batchSize = 0; + for (size_t i = 0; i < copySize.size(); ++i) + batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]); + + auto copyArg = [batchSize, stream](MatrixPtr& dst, + MatrixPtr src, + int desStartRow, + int srcStartRow, + int size, + bool useGpu) { + if (!src) { + dst.reset(); + return; + } + size_t width = src->getWidth(); + if (!dst) { + dst = src->clone(batchSize, width, useGpu); + } else { + dst->resize(batchSize, width); + } + + MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size); + tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream); + }; + + auto copyIds = [batchSize, stream](IVectorPtr& dst, + const IVectorPtr& src, + int desStartRow, + int srcStartRow, + int size, + bool useGpu) { + if (!src) { + dst.reset(); + return; + } + IVector::resizeOrCreate(dst, batchSize, useGpu); + dst->subVec(desStartRow, size) + ->copyFrom(*src->subVec(srcStartRow, size), stream); + }; + + auto copyStrs = [batchSize](SVectorPtr& dst, + const SVectorPtr& src, + int desStartRow, + int srcStartRow, + int size, + bool useGpu) { + if (!src) { + dst.reset(); + return; + } + if (!dst) { + dst = std::make_shared<std::vector<std::string>>(batchSize); + } else { + dst->resize(batchSize); + } + std::copy(src->begin() + srcStartRow, + src->begin() + srcStartRow + size, + dst->begin() + desStartRow); + }; + + dataId = args[0].dataId; + CHECK_NE(seqStartPos.size(), 0UL); + int desStartRow = 0; + for (size_t i = 0; i < copySize.size(); ++i) { + int startPos = seqStartPos[i]; + int endPos = seqStartPos[i + 1]; + CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos)); + for (int j = startPos; j < endPos; ++j) { + const Argument& arg = args[j - startPos]; + CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have " + << "the same dataId."; + const int srcStartRow = selectRows[j]; + copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu); + copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu); + if (passType != PASS_TEST) { + copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu); + } + copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu); + copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu); + desStartRow += copySize[i]; + } + } + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, seqStartPos.size(), useGpu); + sequenceStartPositions->copyFrom( + seqStartPos.data(), seqStartPos.size(), useGpu); +}
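[Editor's note -- illustrative sketch, not part of the original patch. The helper name concatBatchSize and the sample values are hypothetical; it only restates how the concat overload above sizes its destination from seqStartPos and copySize.]

#include <cstddef>
#include <vector>

static size_t concatBatchSize(const std::vector<int>& seqStartPos,
                              const std::vector<int>& copySize) {
  // Each group i contributes copySize[i] rows for every sequence slot
  // in [seqStartPos[i], seqStartPos[i + 1]).
  size_t batchSize = 0;
  for (size_t i = 0; i < copySize.size(); ++i) {
    batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]);
  }
  return batchSize;  // e.g. seqStartPos = {0, 2, 3}, copySize = {4, 6} -> 14
}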
 + +void Argument::concat(const std::vector<Argument>& args, + bool useGpu, + hl_stream_t stream, + PassType passType) { + int32_t batchSize = 0; + int64_t numSequences = 0; + int64_t numSubSequences = 0; + for (auto& arg : args) { + batchSize += arg.getBatchSize(); + numSequences += arg.getNumSequences(); + numSubSequences += arg.getNumSubSequences(); + } + + auto copyArg = [batchSize, stream]( + MatrixPtr& dst, MatrixPtr src, int startRow, bool useGpu) { + if (!src) { + dst.reset(); + return; + } + size_t width = src->getWidth(); + if (!dst) { + dst = src->clone(batchSize, width, useGpu); + } else { + dst->resize(batchSize, width); + } + + MatrixPtr tmpMatrix = dst->subMatrix(startRow, src->getHeight()); + tmpMatrix->copyFrom(*src, stream); + }; + + auto copyIds = [batchSize, stream]( + IVectorPtr& dst, const IVectorPtr& src, int startRow, bool useGpu) { + if (!src) { + dst.reset(); + return; + } + IVector::resizeOrCreate(dst, batchSize, useGpu); + dst->subVec(startRow, src->getSize())->copyFrom(*src, stream); + }; + + auto copyStrs = [batchSize]( + SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) { + if (!src) { + dst.reset(); + return; + } + if (!dst) { + dst = std::make_shared<std::vector<std::string>>(batchSize); + } else { + dst->resize(batchSize); + } + std::copy(src->begin(), src->end(), dst->begin() + startRow); + }; + + auto copySequencePos = [](ICpuGpuVectorPtr& dstSeq, + const ICpuGpuVectorPtr& srcSeq, + int dstNumSequences, + int srcNumSequences, + int& startSequences, + int startRow) { + if (srcSeq) { + ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false); + const int* src = srcSeq->getData(false); + int* dest = dstSeq->getMutableData(false); + for (int i = 0; i < srcNumSequences + 1; ++i) { + dest[i + startSequences] = src[i] + startRow; + } + startSequences += srcNumSequences; + } else { + dstSeq.reset(); + } + }; + + int startRow = 0; + int startSequences = 0; + int startSubSequences = 0; + dataId = args[0].dataId; + for (auto& arg : args) { + CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have" + << " the same dataId"; + copyArg(in, arg.in, startRow, useGpu); + copyArg(value, arg.value, startRow, useGpu); + if (passType != PASS_TEST) copyArg(grad, arg.grad, startRow, useGpu); + copyIds(ids, arg.ids, startRow, useGpu); + copySequencePos(sequenceStartPositions, + arg.sequenceStartPositions, + numSequences, + arg.getNumSequences(), + startSequences, + startRow); + copySequencePos(subSequenceStartPositions, + arg.subSequenceStartPositions, + numSubSequences, + arg.getNumSubSequences(), + startSubSequences, + startRow); + copyStrs(strs, arg.strs, startRow, useGpu); + startRow += arg.getBatchSize(); + } +} + +void Argument::splitByDataId(const std::vector<Argument>& argus, + std::vector<std::vector<Argument>>* arguGroups) { + arguGroups->clear(); + int lastDataId = -1; + for (const auto& argu : argus) { + if (argu.dataId == -1) { + // dataId is -1: always start a new group + arguGroups->emplace_back(); + lastDataId = -1; + } else if (argu.dataId != lastDataId) { + // not -1 and different from the last Argument: start a new group + arguGroups->emplace_back(); + lastDataId = argu.dataId; + } else { + // not -1 and equal to the last Argument: keep the current group + } + arguGroups->back().push_back(argu); + } +} + +void Argument::getSeqInfo(std::vector<SeqInfo>* seqInfo) const { + const int* starts = sequenceStartPositions->getData(false); + const int* subStarts = + hasSubseq() ?
subSequenceStartPositions->getData(false) : nullptr; + size_t numSequences = getNumSequences(); + seqInfo->reserve(numSequences); + int subSeqEnd = 0; + for (size_t i = 0; i < numSequences; ++i) { + SeqInfo info; + info.seqStart = starts[i]; + info.subLevelLength = starts[i + 1] - starts[i]; + info.seqId = i; + if (hasSubseq()) { + info.subSeqStart = subSeqEnd; + while (subStarts[subSeqEnd] < starts[i + 1]) { + ++subSeqEnd; + } + info.topLevelLength = subSeqEnd - info.subSeqStart; + } else { + info.topLevelLength = info.subLevelLength; + info.subSeqStart = 0; // not used + } + seqInfo->push_back(info); + } + std::sort( + seqInfo->begin(), seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) { + return a.topLevelLength > b.topLevelLength; + }); +} + +void Argument::checkSubset() const { + if (getNumSequences() > getNumSubSequences()) { + LOG(FATAL) << "numSubSequences is less than numSequences (" + << getNumSubSequences() << " vs. " << getNumSequences() << ")"; + } + const int* start = sequenceStartPositions->getData(false); + const int* subStart = subSequenceStartPositions->getData(false); + int seqId = 0; + int subSeqId = 0; + while (seqId < getNumSequences() && subSeqId < getNumSubSequences()) { + if (start[seqId] > subStart[subSeqId]) { + ++subSeqId; + } else if (start[seqId] == subStart[subSeqId]) { + ++subSeqId; + ++seqId; + } else { + LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions"; + } + } + if (seqId < getNumSequences()) { + LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions"; + } +} + +void Argument::degradeSequence(const Argument& input) { + CHECK_EQ(input.hasSubseq(), 1UL); + size_t numSequences = input.getNumSequences(); + size_t numSubSequences = input.getNumSubSequences(); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, numSequences + 1, false); + int* tgtBuf = sequenceStartPositions->getMutableData(false); + const int* starts = input.sequenceStartPositions->getData(false); + const int* subStarts = input.subSequenceStartPositions->getData(false); + int seqId = 0; + for (size_t subSeqId = 0; subSeqId < numSubSequences; ++subSeqId) { + if (subStarts[subSeqId] == starts[seqId]) { + tgtBuf[seqId] = subSeqId; + seqId++; + } + } + tgtBuf[numSequences] = numSubSequences; +} + +void Argument::poolSequenceWithStride(const Argument& input, + size_t stride, + ICpuGpuVectorPtr* stridePostions, + bool reversed) { + // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5, + // then sequenceStartPositions = [0, 2, 3, 4, 7]. 
+ // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30]; + // if reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30] + + CHECK(input.sequenceStartPositions); + CHECK_EQ(input.hasSubseq(), 0UL); + CHECK_GT(stride, 0UL) << "stride must be larger than 0"; + size_t numSequences = input.getNumSequences(); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, numSequences + 1, false); + const int* starts = input.sequenceStartPositions->getData(false); + int* tgtBuf = sequenceStartPositions->getMutableData(false); + // first index of target sequence and stride positions are both 0 + tgtBuf[0] = 0; + std::vector<int> stridePos; + for (size_t seqId = 0; seqId < numSequences; ++seqId) { + size_t seqLength = starts[seqId + 1] - starts[seqId]; + stridePos.emplace_back(starts[seqId]); + if (seqLength == 0) { + // empty sequence + tgtBuf[seqId + 1] = tgtBuf[seqId]; + } else { + int size = ceil((float)seqLength / stride); + tgtBuf[seqId + 1] = tgtBuf[seqId] + size; + for (int i = 0; i < size - 1; ++i) { + int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride + : stridePos.back() + stride; + stridePos.emplace_back(cur); + } + } + } + stridePos.emplace_back(starts[numSequences]); + int size = stridePos.size(); + CHECK_EQ(size - 1, tgtBuf[numSequences]); + ICpuGpuVector::resizeOrCreate(*stridePostions, size, false); + (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size); +} + +void Argument::getValueString( + std::unordered_map<std::string, std::string>* out) const { + if (value) { + std::ostringstream os; + value->print(os); + out->insert({"value", os.str()}); + } + if (ids) { + std::ostringstream os; + ids->print(os, ids->getSize()); + out->insert({"ids", os.str()}); + } + if (sequenceStartPositions) { + std::ostringstream os; + sequenceStartPositions->getVector(false)->print( + os, sequenceStartPositions->getSize()); + out->insert({"sequence pos", os.str()}); + } + if (subSequenceStartPositions) { + std::ostringstream os; + subSequenceStartPositions->getVector(false)->print( + os, subSequenceStartPositions->getSize()); + out->insert({"sub-sequence pos", os.str()}); + } +} + +void Argument::printValueString(std::ostream& stream, + const std::string& prefix) const { + std::unordered_map<std::string, std::string> out; + getValueString(&out); + for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) { + auto it = out.find(field); + if (it != out.end()) { + stream << prefix << field << ":\n" << it->second; + } + } +} + +void Argument::subArgFrom(const Argument& input, + size_t offset, + size_t height, + size_t width, + bool useGpu, + bool trans, + bool seqFlag, + size_t seqStart, + size_t seqSize) { + if (input.value) { + value = Matrix::create( + input.value->getData() + offset * width, height, width, trans, useGpu); + } + if (input.ids) { + ids = IVector::create(input.ids->getData() + offset, height, useGpu); + } + if (input.grad) { + grad = Matrix::create( + input.grad->getData() + offset * width, height, width, trans, useGpu); + } + if (seqFlag) { + sequenceStartPositions = std::make_shared<ICpuGpuVector>( + *(input.sequenceStartPositions), seqStart, seqSize); + } +} + +void Argument::reorganizeSeqInfo( + const ICpuGpuVectorPtr seqStartPos, + const ICpuGpuVectorPtr subSeqStartPos, + std::vector<std::vector<int>>& reorganizedSeqInfo) { + CHECK(seqStartPos); + reorganizedSeqInfo.clear(); + + int seqNum = seqStartPos->getSize() - 1; + int* seqStarts = seqStartPos->getMutableData(false); + + if (subSeqStartPos) { + int* subSeqStarts = subSeqStartPos->getMutableData(false); + reorganizedSeqInfo.resize(seqNum, std::vector<int>()); + int seqIdx = 0; + for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) { + reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); + if (subSeqStarts[i] == seqStarts[seqIdx + 1]) { + seqIdx++; + if (seqIdx == seqNum) return; + reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); + } + } + } else { + reorganizedSeqInfo.resize(1, std::vector<int>(seqNum + 1, 0)); + memcpy(reorganizedSeqInfo[0].data(), + seqStarts, + sizeof(int) * seqStartPos->getSize()); + } +} + +} // namespace paddle
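[Editor's note -- illustrative sketch, not part of the original patch. A standalone re-implementation of the nested-sequence regrouping done by Argument::reorganizeSeqInfo above, on plain vectors, so the documented example can be checked in isolation. With seqStart = {0, 4, 15, 20, 28} and subSeqStart = {0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28} it yields {{0, 3, 4}, {4, 5, 7, 10, 15}, {15, 20}, {20, 22, 23, 25, 28}}.]

#include <vector>

static std::vector<std::vector<int>> regroup(
    const std::vector<int>& seqStart, const std::vector<int>& subSeqStart) {
  std::vector<std::vector<int>> out(seqStart.size() - 1);
  size_t seqIdx = 0;
  for (int pos : subSeqStart) {
    out[seqIdx].push_back(pos);
    if (pos == seqStart[seqIdx + 1]) {  // this boundary closes a sequence
      if (++seqIdx == out.size()) break;
      out[seqIdx].push_back(pos);       // and also starts the next one
    }
  }
  return out;
}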
diff --git a/paddle/legacy/parameter/Argument.h b/paddle/legacy/parameter/Argument.h new file mode 100644 index 0000000000000000000000000000000000000000..f936d944cbfbf71d01528e88f7380a6052409f1e --- /dev/null +++ b/paddle/legacy/parameter/Argument.h @@ -0,0 +1,349 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "hl_gpu.h" + +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/utils/Locks.h" +#include "paddle/utils/Util.h" + +namespace paddle { + +typedef std::shared_ptr<std::vector<std::string>> SVectorPtr; + +struct Argument { + Argument() + : in(nullptr), + value(nullptr), + ids(nullptr), + grad(nullptr), + strs(nullptr), + frameHeight(0), + frameWidth(0), + frameDepth(0), + sequenceStartPositions(nullptr), + subSequenceStartPositions(nullptr), + cpuSequenceDims(nullptr), + deviceId(-1), + allCount(0), + valueCount(0), + gradCount(0), + dataId(0) {} + Argument(const Argument& argument) { + *this = argument; + valueCount = 0; + gradCount = 0; + dataId = argument.dataId; + } + ~Argument() {} + + void operator=(const Argument& argument) { + in = argument.in; + value = argument.value; + ids = argument.ids; + grad = argument.grad; + strs = argument.strs; + sequenceStartPositions = argument.sequenceStartPositions; + subSequenceStartPositions = argument.subSequenceStartPositions; + cpuSequenceDims = argument.cpuSequenceDims; + deviceId = argument.deviceId; + allCount = argument.allCount; + frameHeight = argument.frameHeight; + frameWidth = argument.frameWidth; + frameDepth = argument.frameDepth; + dataId = argument.dataId; + } + + MatrixPtr in; // used if needed + MatrixPtr value; + IVectorPtr ids; // a sequence of ids. Can be used as class ids for costLayer + MatrixPtr grad; // If empty, gradient is not needed. + SVectorPtr strs; + + // A dataBatch includes batchSize frames; one frame may be more than a vector + size_t frameHeight; + size_t frameWidth; + size_t frameDepth; + + // If NULL, each position is treated independently. + // Otherwise, its size should be #NumberOfSequences + 1. + // The first position is always 0 and + // the last position should be equal to batchSize. + ICpuGpuVectorPtr sequenceStartPositions; + + // If NULL, each sequence has no subsequence. + // Otherwise, its size should be #NumberOfSubSequences + 1. + // The first position is always 0 and + // the last position should be equal to batchSize. + ICpuGpuVectorPtr subSequenceStartPositions;
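  // Editor's note (illustrative example, not part of the original patch):
  // a batch of 2 sequences over 10 frames, where the first sequence has
  // 2 sub-sequences and the second has 3, would be encoded as
  //   sequenceStartPositions    = {0, 4, 10}
  //   subSequenceStartPositions = {0, 2, 4, 6, 8, 10}
  // Both arrays start at 0 and end at batchSize (10), and every value in the
  // first array also appears in the second; checkSubset() verifies this.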
+ + // dimension of sequence, stored only in CPU + IVectorPtr cpuSequenceDims; + + int deviceId; // the GPU device id that the argument resides on + int allCount; // the number of output layers using this argument + mutable int valueCount; // consumers wait on this counter before forward + mutable int gradCount; // the producer waits on this counter before backward + mutable LockedCondition valueReadyCond; + mutable LockedCondition gradReadyCond; + + int dataId; // dataProvider id + + /* Increase the reference count of the argument. */ + void countIncrement() { allCount++; } + + int getAllCount() const { return allCount; } + + void waitValueReady() const { + valueReadyCond.wait([this] { return (valueCount != 0); }); + + std::lock_guard<std::mutex> guard(*valueReadyCond.mutex()); + valueCount--; + } + + void notifyValueReady() const { + valueReadyCond.notify_all([this] { valueCount = allCount; }); + } + + void waitGradReady() const { + gradReadyCond.wait([this] { return (gradCount == allCount); }); + gradCount = 0; + } + + void notifyGradReady() const { + gradReadyCond.notify_all([this] { gradCount++; }); + } + + int64_t getBatchSize() const { + if (value) return value->getHeight(); + if (ids) return ids->getSize(); + if (grad) return grad->getHeight(); + if (in) return in->getHeight(); + if (strs) return strs->size(); + return 0; + } + size_t getFrameHeight() const { return frameHeight; } + size_t getFrameWidth() const { return frameWidth; } + size_t getFrameDepth() const { return frameDepth; } + void setFrameHeight(size_t h) { frameHeight = h; } + void setFrameWidth(size_t w) { frameWidth = w; } + void setFrameDepth(size_t d) { frameDepth = d; } + + int64_t getNumSequences() const { + return sequenceStartPositions ? sequenceStartPositions->getSize() - 1 + : getBatchSize(); + } + + int64_t getNumSubSequences() const { + return subSequenceStartPositions ? subSequenceStartPositions->getSize() - 1 + : getBatchSize(); + } + + bool hasSeq() const { return sequenceStartPositions != nullptr; } + bool hasSubseq() const { return subSequenceStartPositions != nullptr; } + + const int* getCpuStartPositions() const { + return hasSubseq() ? subSequenceStartPositions->getData(false) + : sequenceStartPositions->getData(false); + } + + static inline real sum(const std::vector<Argument>& arguments) { + real cost = 0; + for (auto& arg : arguments) { + if (arg.value) { + SetDevice device(arg.deviceId); + cost += arg.value->getSum(); + } + } + return cost; + }
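  // Editor's note (illustrative, not in the original patch): the counters
  // above implement a one-producer / many-consumers handshake. The producing
  // layer calls notifyValueReady(), which sets valueCount = allCount; each of
  // the allCount consuming layers then calls waitValueReady(), which blocks
  // until the counter is non-zero and decrements it. In backward the roles
  // flip: every consumer calls notifyGradReady() (gradCount++), and the
  // producer's waitGradReady() unblocks once gradCount == allCount, then
  // resets it to 0 for the next pass.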
+ + /** + * @brief (value, ids, grad, sequenceStartPositions) of output are a subset + * of input. Note that the output shares the same memory as the input. + * + * @param input[in] input + * @param offset[in] offset in terms of rows + * @param height[in] height of output.value + * @param width[in] width of output.value + * @param useGpu[in] + * @param trans[in] whether input.value is transposed + * @param seqFlag[in] whether input has sequenceStartPositions + * @param seqStart[in] offset of input.sequenceStartPositions + * @param seqSize[in] length of output.sequenceStartPositions + */ + void subArgFrom(const Argument& input, + size_t offset, + size_t height, + size_t width, + bool useGpu, + bool trans = false, + bool seqFlag = false, + size_t seqStart = 0, + size_t seqSize = 0); + /* + * for sequence input: + * startSeq: the sequence id to start from + * copySize: how many sequences to copy + * return value: how many samples are copied + * for non-sequence input: + * startSeq: the sample id to start from + * copySize: how many samples to copy + * return value: how many samples are copied + * Note that when specifying the stream explicitly in this case, + * synchronize should also be called somewhere after this function + */ + int32_t resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu, + hl_stream_t stream); + + /* + * Same as the above function, except that the stream is + * HPPL_STREAM_DEFAULT and synchronize is automatically called + * inside it + */ + int32_t resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu = FLAGS_use_gpu); + + void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream); + + /* + * Same as the above function, except that the stream is + * HPPL_STREAM_DEFAULT and synchronize is automatically called + * inside it + */ + void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu); + + /* + @brief Concatenate several arguments into one and put the result into this + argument. + @param args : a vector of argument, each element of which is a frame in a + batch of sequences. + @param selectRows : select several rows of args to concatenate + @param seqStartPos : sequence start positions in the final Argument + @param stream : CUDA stream + @param passType : type of task, training or testing + */ + void concat(const std::vector<Argument>& args, + const std::vector<int>& selectRows, + const std::vector<int>& seqStartPos, + const std::vector<int>& copySize, + bool useGpu, + hl_stream_t stream, + PassType passType); + + /* + Concatenate several args into one and put the result into this. + */ + void concat(const std::vector<Argument>& src, + bool useGpu = FLAGS_use_gpu, + hl_stream_t stream = HPPL_STREAM_DEFAULT, + PassType passType = PASS_TEST); + + /* + * split the vector into several vectors according to dataId + */ + static void splitByDataId(const std::vector<Argument>& argus, + std::vector<std::vector<Argument>>* arguGroups); + + struct SeqInfo { + // Equal to sequence length for sequence data + // Equal to number of subsequences for subsequence data + int topLevelLength; + + int seqStart; + int seqId; + + // Equal to topLevelLength for sequence data + // Equal to sum of the length of subsequences for subsequence data + int subLevelLength; + + // Only used for subsequence data, start position of this sequence + // in subSequenceStartPositions, i.e. + // subSequenceStartPositions[subSeqStart] == seqStart + int subSeqStart; + }; + /* + Get SeqInfo for each sequence of this argument. + Elements in *seqInfo are sorted by topLevelLength in descending order + */ + void getSeqInfo(std::vector<SeqInfo>* seqInfo) const; + + /* + Check whether sequenceStartPositions is a subset of + subSequenceStartPositions.
+ */ + void checkSubset() const; + + /* + Degrade a sequence that has sub-sequences to a plain sequence. + */ + void degradeSequence(const Argument& input); + + /* + After pooling with stride n (n is smaller than sequence length), + a long sequence will be shortened. + This function is invalid for sequences that have sub-sequences. + */ + void poolSequenceWithStride(const Argument& input, + size_t stride, + ICpuGpuVectorPtr* stridePositions, + bool reversed = false); + /** + * @brief getValueString will return the argument's output in string. There + * are several kinds of output. The keys of the output dictionary are + * 'value', 'ids', 'sequence pos', 'sub-sequence pos'. + * @param out [out]: the return values. + */ + void getValueString(std::unordered_map<std::string, std::string>* out) const; + + /** + * @brief printValueString will print the argument's output in order of + * 'value', 'ids', 'sequence pos', 'sub-sequence pos'. + * @param stream: Output stream + * @param prefix: line prefix for printing. + */ + void printValueString(std::ostream& stream, + const std::string& prefix = "") const; + + /** + * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and + * subSequenceStartPositions into a 2 dimensional array: reorganizedSeqInfo. + * + * @param seqStartPos: sequenceStartPositions of an Argument. + * @param subSeqStartPos: subSequenceStartPositions of an Argument. + * @param reorganizedSeqInfo: the reorganized sequence start position + * information. + * + * Examples: + * seqStartPos: [0, 4, 15, 20, 28] + * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28] + * reorganizedSeqInfo: + * [ + * [0,3,4], + * [4,5,7,10,15], + * [15,20], + * [20,22,23,25,28] + * ] + */ + static void reorganizeSeqInfo( + const ICpuGpuVectorPtr seqStartPos, + const ICpuGpuVectorPtr subSeqStartPos, + std::vector<std::vector<int>>& reorganizedSeqInfo); +}; + +} // namespace paddle
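[Editor's note -- illustrative usage sketch, not part of the original patch. The function name sliceExample and the dimensions are hypothetical; it only demonstrates that subArgFrom() builds views rather than copies.]

#include "paddle/legacy/parameter/Argument.h"

void sliceExample(const paddle::Argument& whole) {
  // Assume whole.value is a 100 x 32 matrix on the CPU.
  paddle::Argument part;
  // View of rows [20, 50): offset 20, height 30, width 32, not transposed.
  part.subArgFrom(whole, /*offset=*/20, /*height=*/30, /*width=*/32,
                  /*useGpu=*/false);
  // part.value now aliases whole.value's storage; writes through one view
  // are visible through the other.
}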
diff --git a/paddle/parameter/AverageOptimizer.cpp b/paddle/legacy/parameter/AverageOptimizer.cpp similarity index 100% rename from paddle/parameter/AverageOptimizer.cpp rename to paddle/legacy/parameter/AverageOptimizer.cpp diff --git a/paddle/parameter/AverageOptimizer.h b/paddle/legacy/parameter/AverageOptimizer.h similarity index 100% rename from paddle/parameter/AverageOptimizer.h rename to paddle/legacy/parameter/AverageOptimizer.h diff --git a/paddle/parameter/CMakeLists.txt b/paddle/legacy/parameter/CMakeLists.txt similarity index 100% rename from paddle/parameter/CMakeLists.txt rename to paddle/legacy/parameter/CMakeLists.txt diff --git a/paddle/legacy/parameter/FirstOrderOptimizer.cpp b/paddle/legacy/parameter/FirstOrderOptimizer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..89bb840f82c30081143c99c38c02a770bf0e1b96 --- /dev/null +++ b/paddle/legacy/parameter/FirstOrderOptimizer.cpp @@ -0,0 +1,330 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "FirstOrderOptimizer.h" +#include "paddle/legacy/math/TrainingAlgorithmOp.h" +#include "paddle/utils/Flags.h" +#include "paddle/utils/Util.h" + +#include <cmath> + +DEFINE_bool(log_clipping, false, "enable log clipping or not"); + +namespace paddle { + +SparseMomentumParameterOptimizer::SparseMomentumParameterOptimizer( + const OptimizationConfig& optConfig) + : ParameterOptimizer(optConfig) { + addParameterType(PARAMETER_MOMENTUM); + addParameterType(PARAMETER_MOMENTUM_UT); + addParameterType(PARAMETER_MOMENTUM_VT); + alpha_ = 1; + beta_ = 1; + tau_ = -1; + threshold_ = 1e+06; +} + +void SparseMomentumParameterOptimizer::init(size_t numRows, + const ParameterConfig* config) { + isParameterSparse_ = numRows != 0; + t0Vec_.resize(numRows); + t0Vec_.assign(t0Vec_.size(), 0); + timer_ = 0; + momentum_ = config->momentum(); + decayRate_ = config->decay_rate(); + gamma_ = config->learning_rate(); +} + +void SparseMomentumParameterOptimizer::startBatch(int64_t numSamplesProcessed) { + learningRate_ = calcLearningRate(numSamplesProcessed, pass_); + if (isParameterSparse_) { + tau_ = tau_ + beta_ / alpha_; + alpha_ = alpha_ / momentum_; + beta_ = beta_ / (1 + decayRate_ * gamma_ * learningRate_); + } +} + +void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + size_t sparseId) const { + if (sparseId != -1LU) { + CHECK_LT(sparseId, t0Vec_.size()); + if (t0Vec_[sparseId] == 0) { + vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]); + t0Vec_[sparseId] = 1; + } + vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT], + -alpha_ * gamma_ * learningRate_); + vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT], + tau_ * alpha_ * gamma_ * learningRate_); + vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT], + tau_ / beta_ + 1.0 / alpha_, + *vecs[PARAMETER_MOMENTUM_VT], + 1.0 / beta_); + + } else { + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + learningRate_ * paraConfig.learning_rate(), + paraConfig.momentum(), + applyDecay_ ? paraConfig.decay_rate() : 0); + } +}
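[Editor's note -- illustrative math, not part of the original patch; my reconstruction from the code above, assuming Vector::add(b, p1, c, p2) computes p1*b + p2*c. The scalar bookkeeping makes plain momentum lazy for sparse gradients: the optimizer maintains the invariant]

  \tau_t u_t + v_t = \beta_t \theta_t

[(see the restart comment in needSpecialTraversal() below). startBatch() only advances scalars (\tau += \beta/\alpha, \alpha /= momentum, \beta /= 1 + decayRate * \gamma * \epsilon), and update() touches just the rows that actually receive a sparse gradient g:]

  u <- u - \alpha \gamma \epsilon g
  v <- v + \tau \alpha \gamma \epsilon g
  \theta = (\tau/\beta + 1/\alpha) u + (1/\beta) v

[where \gamma is the configured learning rate (gamma_) and \epsilon the scheduled rate (learningRate_); these mirror the three vector calls in update().]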
+ +ParameterOptimizer::TraverseCallback +SparseMomentumParameterOptimizer::needSpecialTraversal( + const ParameterConfig& config) const { + if (alpha_ > threshold_ && isParameterSparse_) { + // Restart to avoid large value multiplication + // 1. \alpha = 1, \beta = 1, \tau = 0 + // 2. Note that \tau * u_t + v_t = \beta \theta_t, therefore: + // u_t should be rescaled to u_t/alpha_ + // v_t should be reset to \theta_t + return [this](const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) { + vecs[PARAMETER_MOMENTUM_UT]->divScalar(alpha_); + vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]); + }; + } else { + return nullptr; + } +} + +void SparseMomentumParameterOptimizer::finishBatch() { + timer_++; + if (!isParameterSparse_) return; + if (alpha_ > threshold_) { + alpha_ = 1; + beta_ = 1; + tau_ = -1; + } +} + +void AdagradParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + + real epsilon = optConfig_.ada_epsilon(); + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? config.decay_rate() : 0; + + adagradApply(value, + grad, + mom, + accum_buffer, + accum, + lr, + epsilon, + learningRate, + momentum, + decayRate); +} + +ParameterOptimizer::TraverseCallback +AdagradParameterOptimizer::needSpecialTraversal( + const ParameterConfig& config) const { + if (numUpdates_ % kMaxNumAccumulates == 0) { + // Move the sum to a different buffer to avoid loss of precision + // due to too many sums. + return [](const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) { + vecs[PARAMETER_GRADIENT_SQURESUM]->add( + *vecs[PARAMETER_GRADIENT_SQURESUM1]); + vecs[PARAMETER_GRADIENT_SQURESUM1]->zeroMem(); + }; + } else { + return nullptr; + } +} + +void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + CHECK(sparseId == -1LU) << "Sparse update is not supported"; + + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? config.decay_rate() : 0; + + adadeltaApply(value, + grad, + mom, + accum, + accum_update, + lr, + rou_, + epsilon_, + learningRate, + momentum, + decayRate); +} + +void RMSPropParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + + real accumulatedRou = rou_; + bool firstTime = timer_ == 0; + if (sparseId != -1LU) { + CHECK_LT(sparseId, t0Vec_.size()); + accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]); + firstTime = t0Vec_[sparseId] == 0; + t0Vec_[sparseId] = timer_ + 1; + } + + real epsilon = optConfig_.ada_epsilon(); + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ?
config.decay_rate() : 0; + + rmspropApply(value, + grad, + mom, + sum, + sum1, + lr, + accumulatedRou, + rou_, + epsilon, + learningRate, + momentum, + decayRate, + firstTime); +} + +void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + + real accumulatedRou = rou_; + bool firstTime = timer_ == 0; + if (sparseId != -1LU) { + CHECK_LT(sparseId, t0Vec_.size()); + accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]); + firstTime = t0Vec_[sparseId] == 0; + t0Vec_[sparseId] = timer_ + 1; + } + + real epsilon = optConfig_.ada_epsilon(); + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? config.decay_rate() : 0; + + decayedAdagradApply(value, + grad, + mom, + sum, + lr, + accumulatedRou, + rou_, + epsilon, + learningRate, + momentum, + decayRate, + firstTime); +} + +void AdamParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + CHECK(sparseId == -1UL) << "Sparse update is not supported"; + + real beta1_power = std::pow(beta1_, step_); + real beta2_power = std::pow(beta2_, step_); + real learningRate = config.learning_rate() * learningRate_; + + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM]; + + adamApply(value, + grad, + mom, + v, + beta1_, + beta2_, + beta1_power, + beta2_power, + epsilon_, + learningRate); +} + +void AdamaxParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + CHECK(sparseId == -1UL) << "Sparse update is not supported"; + real learningRate = config.learning_rate() * learningRate_; + + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM]; + + adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate); +}
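[Editor's note -- illustrative math, not part of the original patch. adamApply() lives in TrainingAlgorithmOp; assuming it implements the standard Adam rule of Kingma & Ba (2015), which the argument list above suggests, the per-element update would be]

  m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
  v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
  \theta_t = \theta_{t-1} - lr * sqrt(1 - \beta_2^t) / (1 - \beta_1^t) * m_t / (sqrt(v_t) + \epsilon)

[with \beta_1^t and \beta_2^t passed in as the beta1_power and beta2_power values computed above.]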
"local" : "global"; + + real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax(); + if (maxAbsGrad > threshold) { + if (FLAGS_log_clipping) { + real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() / + vecs[PARAMETER_GRADIENT]->getSize(); + LOG(INFO) << "parameter=" << config.name() << " need clipping by " + << field << " threshold=" << threshold + << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad; + } + vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold); + } + optimizer_->update(vecs, config, sparseId); +} + +} // namespace paddle diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/legacy/parameter/FirstOrderOptimizer.h similarity index 100% rename from paddle/parameter/FirstOrderOptimizer.h rename to paddle/legacy/parameter/FirstOrderOptimizer.h diff --git a/paddle/parameter/LearningRateScheduler.cpp b/paddle/legacy/parameter/LearningRateScheduler.cpp similarity index 100% rename from paddle/parameter/LearningRateScheduler.cpp rename to paddle/legacy/parameter/LearningRateScheduler.cpp diff --git a/paddle/parameter/LearningRateScheduler.h b/paddle/legacy/parameter/LearningRateScheduler.h similarity index 100% rename from paddle/parameter/LearningRateScheduler.h rename to paddle/legacy/parameter/LearningRateScheduler.h diff --git a/paddle/parameter/OptimizerFunctions.cpp b/paddle/legacy/parameter/OptimizerFunctions.cpp similarity index 100% rename from paddle/parameter/OptimizerFunctions.cpp rename to paddle/legacy/parameter/OptimizerFunctions.cpp diff --git a/paddle/parameter/OptimizerFunctions.h b/paddle/legacy/parameter/OptimizerFunctions.h similarity index 100% rename from paddle/parameter/OptimizerFunctions.h rename to paddle/legacy/parameter/OptimizerFunctions.h diff --git a/paddle/parameter/OptimizerWithRegularizer.cpp b/paddle/legacy/parameter/OptimizerWithRegularizer.cpp similarity index 100% rename from paddle/parameter/OptimizerWithRegularizer.cpp rename to paddle/legacy/parameter/OptimizerWithRegularizer.cpp diff --git a/paddle/parameter/OptimizerWithRegularizer.h b/paddle/legacy/parameter/OptimizerWithRegularizer.h similarity index 100% rename from paddle/parameter/OptimizerWithRegularizer.h rename to paddle/legacy/parameter/OptimizerWithRegularizer.h diff --git a/paddle/legacy/parameter/Parameter.cpp b/paddle/legacy/parameter/Parameter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d00019027b5f9da1ceba4392d606a90602c0b7a1 --- /dev/null +++ b/paddle/legacy/parameter/Parameter.cpp @@ -0,0 +1,425 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/legacy/parameter/FirstOrderOptimizer.h similarity index 100% rename from paddle/parameter/FirstOrderOptimizer.h rename to paddle/legacy/parameter/FirstOrderOptimizer.h diff --git a/paddle/parameter/LearningRateScheduler.cpp b/paddle/legacy/parameter/LearningRateScheduler.cpp similarity index 100% rename from paddle/parameter/LearningRateScheduler.cpp rename to paddle/legacy/parameter/LearningRateScheduler.cpp diff --git a/paddle/parameter/LearningRateScheduler.h b/paddle/legacy/parameter/LearningRateScheduler.h similarity index 100% rename from paddle/parameter/LearningRateScheduler.h rename to paddle/legacy/parameter/LearningRateScheduler.h diff --git a/paddle/parameter/OptimizerFunctions.cpp b/paddle/legacy/parameter/OptimizerFunctions.cpp similarity index 100% rename from paddle/parameter/OptimizerFunctions.cpp rename to paddle/legacy/parameter/OptimizerFunctions.cpp diff --git a/paddle/parameter/OptimizerFunctions.h b/paddle/legacy/parameter/OptimizerFunctions.h similarity index 100% rename from paddle/parameter/OptimizerFunctions.h rename to paddle/legacy/parameter/OptimizerFunctions.h diff --git a/paddle/parameter/OptimizerWithRegularizer.cpp b/paddle/legacy/parameter/OptimizerWithRegularizer.cpp similarity index 100% rename from paddle/parameter/OptimizerWithRegularizer.cpp rename to paddle/legacy/parameter/OptimizerWithRegularizer.cpp diff --git a/paddle/parameter/OptimizerWithRegularizer.h b/paddle/legacy/parameter/OptimizerWithRegularizer.h similarity index 100% rename from paddle/parameter/OptimizerWithRegularizer.h rename to paddle/legacy/parameter/OptimizerWithRegularizer.h diff --git a/paddle/legacy/parameter/Parameter.cpp b/paddle/legacy/parameter/Parameter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d00019027b5f9da1ceba4392d606a90602c0b7a1 --- /dev/null +++ b/paddle/legacy/parameter/Parameter.cpp @@ -0,0 +1,425 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Parameter.h" +#include <algorithm> +#include <fstream> +#include "AverageOptimizer.h" +#include "FirstOrderOptimizer.h" +#include "OptimizerFunctions.h" +#include "OptimizerWithRegularizer.h" +#include "ParameterUpdateFunctions.h" +#include "ThreadLocalBuffer.h" +#include "hl_gpu.h" +#include "paddle/legacy/math/CpuSparseMatrix.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/SparseRowMatrix.h" +#include "paddle/utils/Logging.h" + +DEFINE_int32(enable_grad_share, + (100 * 1024 * 1024), + "threshold for enable gradient parameter share for batch " + "multi-cpu training"); +DEFINE_int32( + grad_share_block_num, + 64, + "block number of gradient parameter share for batch multi-cpu training"); + +namespace paddle { + +const std::string Parameter::kMissParameterFail = "fail"; +const std::string Parameter::kMissParameterRand = "rand"; +const std::string Parameter::kMissParameterZero = "zero"; + +Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit) + : config_(config), + useGpu_(useGpu), + deviceId_(-1), + sharedCount_(0), + updateCounter_(0), + updated_(false), + headerFormat_(PARAM_FORMAT_ORIGINAL) { + setID(-1); /* capture uninitialized id */ + if (useGpu_ && FLAGS_parallel_nn) { + /* gpu environment is specified by device property */ + deviceId_ = config_.device(); + if (deviceId_ < 0) { + useGpu_ = false; + } + } + + if (doInit) { + initialize(); + } + + for (int i = 0; i < config.update_hooks_size(); ++i) { + this->updaterHooks_.push_back(IParameterUpdaterHook::create(config, i)); + } +} + +void Parameter::initialize() { + SetDevice device(deviceId_); + + bufs_[PARAMETER_VALUE] = + Vector::createParallelVector(config_.size(), useGpu_); + bufs_[PARAMETER_VALUE]->zeroMem(); + + if (config_.is_sparse()) { + enableSparseParameter(); + } + + if (!isStatic()) { + bufs_[PARAMETER_GRADIENT] = + Vector::createParallelVector(config_.size(), useGpu_); + bufs_[PARAMETER_MOMENTUM] = + Vector::createParallelVector(config_.size(), useGpu_); + + bufs_[PARAMETER_GRADIENT]->zeroMem(); + bufs_[PARAMETER_MOMENTUM]->zeroMem(); + } +} + +void Parameter::randomize(const VectorPtr& value, + const ParameterConfig& config) { + if (PARAMETER_INIT_UNIFORM == config.initial_strategy()) { + // initialize the parameter with a uniform distribution + real initial_min = config.initial_mean() - config.initial_std(); + real initial_max = config.initial_mean() + config.initial_std(); + value->uniform(initial_min, initial_max); + VLOG(1) << config.name() << ": initial_min=" << initial_min + << ", initial_max=" << initial_max; + } else if (PARAMETER_INIT_NORMAL == config.initial_strategy()) { + /* Initialize the parameters randomly */ + value->randnorm(config.initial_mean(), config.initial_std()); + VLOG(1) << config.name() << ": initial_mean=" << config.initial_mean() + << ", initial_std=" << config.initial_std(); + } else { + LOG(FATAL) << "not supported initial_strategy: " + << config.initial_strategy(); + } +} + +void Parameter::randomize() { + if (!bufs_[PARAMETER_VALUE]) return; + SetDevice device(deviceId_); + Parameter::randomize(bufs_[PARAMETER_VALUE], config_); + + if (config_.is_sparse()) { + if (format_ == SPARSE_CSC) { + sparseRand(intBufs_[PARAMETER_COLS]->getData(), + intBufs_[PARAMETER_ROWS]->getData(), + config_.size(), + config_.dims(1) + 1, + config_.dims(0), + useGpu_); + } else { + sparseRand(intBufs_[PARAMETER_ROWS]->getData(), + intBufs_[PARAMETER_COLS]->getData(), + config_.size(), + config_.dims(0) + 1, + config_.dims(1), + useGpu_); + } + } + setValueUpdated(); +}
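[Editor's note -- illustrative sketch, not part of the original patch. The struct and helper names are hypothetical; this only restates the compressed-sparse index sizing used by enableSparseParameter() and the sparseRand() calls above, where config_.size() is the non-zero count rather than height*width.]

#include <cstddef>

struct SparseIndexSizes { size_t rows, cols; };

static SparseIndexSizes indexSizes(size_t height, size_t width, size_t nnz,
                                   bool csr) {
  // CSR: height + 1 row offsets, one column id per non-zero.
  // CSC: width + 1 column offsets, one row id per non-zero.
  return csr ? SparseIndexSizes{height + 1, nnz}
             : SparseIndexSizes{nnz, width + 1};
}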
+ +void Parameter::zeroMem() { + if (!bufs_[PARAMETER_VALUE]) return; + bufs_[PARAMETER_VALUE]->zeroMem(); + setValueUpdated(); + LOG(INFO) << getName() << " set to 0"; +} + +bool Parameter::isGradShared(size_t* blockNum) { + if (!useGpu_ && !isStatic() && FLAGS_enable_grad_share > 0 && + !isGradSparseUpdate() && + this->getSize() > (size_t)FLAGS_enable_grad_share) { + if (blockNum) { + *blockNum = (size_t)FLAGS_grad_share_block_num; + } + return true; + } + return false; +} + +bool Parameter::isValueShared() { + return !useGpu_ && config_.is_shared() && FLAGS_trainer_count > 1; +} + +bool Parameter::isGradSparseUpdate() const { + return !useGpu_ && !isStatic() && + (config_.sparse_update() || config_.sparse_remote_update()); +} + +void Parameter::setMat(ParameterType pType, int matType) { + CHECK(!mats_[pType]); + + if (config_.dims_size() == 0 && matType == MAT_NORMAL) { + return; + } + + CHECK_EQ((size_t)config_.dims_size(), 2LU); + size_t height = config_.dims(0); + size_t width = config_.dims(1); + if (matType == MAT_NORMAL) { + if (!config_.is_sparse()) { + CHECK_EQ(height * width, bufs_[pType]->getSize()); + mats_[pType] = + Matrix::create(bufs_[pType]->getMemoryHandle(), height, width); + } else { + size_t size = bufs_[pType]->getSize(); + CHECK_GE(height * width, size); + if (format_ == SPARSE_CSR) { + CHECK_EQ(height + 1, intBufs_[PARAMETER_ROWS]->getSize()); + CHECK_EQ(size, intBufs_[PARAMETER_COLS]->getSize()); + } else { + CHECK_EQ(width + 1, intBufs_[PARAMETER_COLS]->getSize()); + CHECK_EQ(size, intBufs_[PARAMETER_ROWS]->getSize()); + } + mats_[pType] = + Matrix::createSparseMatrix(bufs_[pType]->getData(), + intBufs_[PARAMETER_ROWS]->getData(), + intBufs_[PARAMETER_COLS]->getData(), + height, + width, + bufs_[pType]->getSize(), + FLOAT_VALUE, + format_, + false, + useGpu_); + } + } +#ifndef PADDLE_MOBILE_INFERENCE + // NOLINTNEXTLINE + else if (matType == MAT_NORMAL_SHARED) { + CHECK_EQ(height * width, bufs_[pType]->getSize()); + size_t blockNum = 0; + CHECK(isGradShared(&blockNum)); + mats_[pType] = std::make_shared<SharedCpuMatrix>( + blockNum, + std::dynamic_pointer_cast<CpuMemoryHandle>( + bufs_[pType]->getMemoryHandle()), + height, + width); + } else if (matType == MAT_VALUE_SHARED) { + CHECK_EQ(height * width, bufs_[pType]->getSize()); + mats_[pType] = std::make_shared<SharedCpuMatrix>( + std::dynamic_pointer_cast<CpuMemoryHandle>( + bufs_[pType]->getMemoryHandle()), + height, + width); + } else if (matType == MAT_SPARSE_ROW_IDS) { + CHECK_EQ(height * width, bufs_[pType]->getSize()); + mats_[pType] = std::make_shared<SparseRowIdsCpuMatrix>( + std::dynamic_pointer_cast<CpuMemoryHandle>( + bufs_[pType]->getMemoryHandle()), + height, + width); + } else if (matType == MAT_SPARSE_ROW) { + auto valueMat = + std::dynamic_pointer_cast<SparseRowCpuMatrix>(mats_[PARAMETER_VALUE]); + SparseRowCpuMatrix::IndexDictPtr indexDict(nullptr); + if (pType != PARAMETER_VALUE) { + CHECK(valueMat) << "The matrix for PARAMETER_VALUE must be set " + << " and its type must be MAT_SPARSE_ROW," + << " MAT_SPARSE_ROW_PREFETCH or MAT_CACHE_ROW"; + indexDict = valueMat->getIndexDictHandle(); + } + auto mat = + std::make_shared<SparseRowCpuMatrix>(nullptr, + height, + width, + // grad share index with value + indexDict); + mats_[pType] = mat; + } else if (matType == MAT_CACHE_ROW) { + CHECK(isGradSparseUpdate()); + auto mat = std::make_shared<CacheRowCpuMatrix>(height, width); + mats_[pType] = mat; + } else if (matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || + matType == MAT_SPARSE_ROW_PREFETCH) { + auto mat = std::make_shared<SparsePrefetchRowCpuMatrix>( + bufs_[pType] ?
std::dynamic_pointer_cast<CpuMemoryHandle>( + bufs_[pType]->getMemoryHandle()) + : nullptr, + height, + width, + nullptr, // indexDictHandle + getGlobalSyncThreadPool()); + mats_[pType] = mat; + } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) { + CHECK(isGradSparseUpdate()); + mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width); + } +#endif + // NOLINTNEXTLINE + else { + LOG(FATAL) << "Unsupported mat type " << matType; + } +} + +void Parameter::incUpdate(const UpdateCallback& callback) { + // Static parameter is fixed, and does not need to be updated + if (isStatic()) { + return; + } + + ++updateCounter_; + if (isUpdatable()) { + if (callback) callback(this); + clearUpdate(); + } +} + +bool Parameter::save(const std::string& filename) const { + std::ofstream fs(filename, std::ios_base::binary); + CHECK(fs) << "Fail to open " << filename; + return save(fs); +} + +bool Parameter::save(std::ostream& s) const { + CpuVector vec(*bufs_[PARAMETER_VALUE].get()); + Header header; + header.format = headerFormat_; + header.valueSize = sizeof(real); + header.size = getSize(); + + CHECK_EQ(header.size, vec.getSize()); + + CHECK(s.write(reinterpret_cast<const char*>(&header), sizeof(header))) + << "Fail to write parameter " << getName(); + + CHECK(s.write(reinterpret_cast<const char*>(vec.getData()), + header.size * sizeof(real))) + << "Fail to write parameter " << getName(); + if (config_.is_sparse()) { + CpuIVector rows(*intBufs_[PARAMETER_ROWS].get()); + CpuIVector cols(*intBufs_[PARAMETER_COLS].get()); + CHECK(s.write(reinterpret_cast<const char*>(rows.getData()), + rows.getSize() * sizeof(int))) + << "Fail to write parameter " << getName(); + CHECK(s.write(reinterpret_cast<const char*>(cols.getData()), + cols.getSize() * sizeof(int))) + << "Fail to write parameter " << getName(); + } + + return true; +}
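[Editor's note -- illustrative sketch, not part of the original patch. The on-disk layout written above is the packed Header (format, valueSize, size) followed by `size` reals and, for sparse parameters, the rows and cols index arrays. A minimal standalone reader of just the header, assuming the same struct layout; the names ParamFileHeader and peekHeader are hypothetical.]

#include <cstdint>
#include <fstream>
#include <string>

struct ParamFileHeader {
  int32_t format;      // PARAM_FORMAT_* tag
  uint32_t valueSize;  // sizeof(real) used by the writer
  uint64_t size;       // number of values that follow
};

static bool peekHeader(const std::string& path, ParamFileHeader* h) {
  std::ifstream fs(path, std::ios_base::binary);
  return fs && fs.read(reinterpret_cast<char*>(h), sizeof(*h)).good();
}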
+ +/** + * Load parameter value from a file + */ +bool Parameter::load(const std::string& filename) { + std::ifstream fs(filename, std::ios_base::binary); + if (!fs) { + LOG(INFO) << "missing parameters [" << filename << "] while loading model."; + if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) { + LOG(FATAL) << getName() << " missing, not allowed."; + return false; + } + if (kMissParameterRand == FLAGS_load_missing_parameter_strategy) { + LOG(INFO) << getName() << " missing, set to random."; + randomize(); + return true; + } + if (kMissParameterZero == FLAGS_load_missing_parameter_strategy) { + LOG(INFO) << getName() << " missing, set to zero."; + zeroMem(); + return true; + } + LOG(FATAL) << "unsupported load_missing_parameter_strategy: " + << FLAGS_load_missing_parameter_strategy; + return false; + } + return load(fs); +} + +bool Parameter::load(std::istream& s) { + CpuVector vec(*bufs_[PARAMETER_VALUE].get()); + Header header; + CHECK(s.read(reinterpret_cast<char*>(&header), sizeof(header))) + << "Fail to read parameter " << getName(); + CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: " + << header.format; + headerFormat_ = header.format; + CHECK_EQ(header.size, getSize()) + << "The size (" << header.size << ") in the file does not match the size " + << "(" << getSize() << ") of the parameter: " << getName(); + CHECK_EQ(header.valueSize, sizeof(real)) + << "Unsupported valueSize " << header.valueSize << " at: " << getName(); + CHECK(s.read(reinterpret_cast<char*>(vec.getData()), + header.size * sizeof(real))); + + auto& tmp = *bufs_[PARAMETER_VALUE].get(); + if (typeid(tmp) == typeid(GpuVector)) { + bufs_[PARAMETER_VALUE]->copyFrom(vec); + } + + if (config_.is_sparse() && config_.need_compact()) { + // load from a dense parameter with many zeros + CHECK_EQ(config_.dims_size(), 2); + auto height = config_.dims(0); + auto width = config_.dims(1); + auto mat = Matrix::create(vec.getData(), height, width); + CpuSparseMatrix sparseMat(height, + width, + 0, + FLOAT_VALUE, + format_, + /*trans*/ false); + sparseMat.copyFrom(*mat, HPPL_STREAM_DEFAULT); + auto nnz = sparseMat.getElementCnt(); + size_t rowSize = (format_ == SPARSE_CSR) ? height + 1 : nnz; + size_t colSize = (format_ == SPARSE_CSR) ? nnz : width + 1; + + intBufs_[PARAMETER_ROWS]->copyFrom(sparseMat.getRows(), rowSize); + intBufs_[PARAMETER_COLS]->copyFrom(sparseMat.getCols(), colSize); + bufs_[PARAMETER_VALUE]->resize(nnz); // for setMat check + bufs_[PARAMETER_VALUE]->copyFrom(sparseMat.getValue(), nnz); + config_.set_size(nnz); + LOG(INFO) << "compact nnz=" << (1. * nnz / (height * width)) + << " name=" << config_.name(); + } else if (config_.is_sparse()) { + CpuIVector rows(*intBufs_[PARAMETER_ROWS].get()); + CpuIVector cols(*intBufs_[PARAMETER_COLS].get()); + size_t rowSize, colSize; + CHECK_EQ(config_.dims_size(), 2); + if (format_ == SPARSE_CSR) { + rowSize = config_.dims(0) + 1; + colSize = config_.size(); + } else { + rowSize = config_.size(); + colSize = config_.dims(1) + 1; + } + CHECK( + s.read(reinterpret_cast<char*>(rows.getData()), rowSize * sizeof(int))); + CHECK( + s.read(reinterpret_cast<char*>(cols.getData()), colSize * sizeof(int))); + auto& paramRows = *intBufs_[PARAMETER_ROWS].get(); + if (typeid(paramRows) == typeid(GpuIVector)) { + intBufs_[PARAMETER_ROWS]->copyFrom(rows); + } + auto& paramCols = *intBufs_[PARAMETER_COLS].get(); + if (typeid(paramCols) == typeid(GpuIVector)) { + intBufs_[PARAMETER_COLS]->copyFrom(cols); + } + } + + setValueUpdated(); + + return true; +} + +} // namespace paddle diff --git a/paddle/legacy/parameter/Parameter.h b/paddle/legacy/parameter/Parameter.h new file mode 100644 index 0000000000000000000000000000000000000000..75cfb3f4aa6174990ce579171cb9e0e35e7e9b41 --- /dev/null +++ b/paddle/legacy/parameter/Parameter.h @@ -0,0 +1,380 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <stdint.h> + +#include <memory> +#include <string> +#include <vector> + +#include "ParameterConfig.pb.h" +#include "TrainerConfig.pb.h" + +#include "ParameterUpdaterHook.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/utils/Common.h" +#include "paddle/utils/GlobalConstants.h" +#include "paddle/utils/Locks.h" +#include "paddle/utils/ThreadLocal.h" +#include "paddle/utils/Util.h" + +namespace paddle { + +typedef enum { + /// The paddle original basic format + PARAM_FORMAT_ORIGINAL = 0, + + /// See mkldnn_memory_format_t in + /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h + /// for a detailed description. + /// 2D weights tensor in the format (output channels, input channels).
+ PARAM_FORMAT_MKLDNN_OI, + + /// The total number of format items + PARAM_FORMAT_ITEMS, +} PARAM_FORMAT; + +class SparsePrefetchRowCpuMatrix; + +class Parameter; +typedef std::function<void(Parameter* param)> UpdateCallback; +typedef std::function<void(int paramId, Parameter* param)> ParamInitCallback; + +class Parameter; +typedef std::shared_ptr<Parameter> ParameterPtr; + +class Parameter { + public: + Parameter(const ParameterConfig& config, bool useGpu, bool doInit = true); + const std::string& getName() const { return config_.name(); } + + size_t getSize() const { return config_.size(); } + + bool isFullSize() const { + if (bufs_[PARAMETER_VALUE]) { + return this->getSize() == bufs_[PARAMETER_VALUE]->getSize(); + } + return false; + } + + inline bool useGpu() const { return useGpu_; } + + int getDeviceId() const { return deviceId_; } + + void setDevice(int deviceId) { deviceId_ = deviceId; } + + /// The id ranges from 0 to the_total_number_of_parameters - 1 + size_t getID() const { return config_.para_id(); } + + /// ID is an implicit value created when the neural network is built. + void setID(size_t id) { config_.set_para_id(id); } + + bool isStatic() const { return config_.is_static(); } + + enum MatType { + MAT_NORMAL, + /// both value and grad are shared + MAT_NORMAL_SHARED, + + /// Now used in BatchNorm in CPU mode + MAT_VALUE_SHARED, + + /// sparse matrix, which has full size parameter + MAT_SPARSE_ROW_IDS, + /// sparse matrix, parameter size scale by sparse rates. + MAT_SPARSE_ROW_AUTO_GROW, + MAT_CACHE_ROW, + MAT_SPARSE_ROW, + + /// sparse matrix for prefetching parameter from pserver + MAT_SPARSE_ROW_PREFETCH, + /// same as above, but parameter has full size for saving parameter in local + MAT_SPARSE_ROW_PREFETCH_FULL_SIZE, + }; + + void enableSparseParameter() { + if (config_.is_sparse()) { + if (config_.format() == "csr") { + size_t height = config_.dims(0); + size_t nnz = config_.size(); + enableIntType(PARAMETER_ROWS, height + 1); + enableIntType(PARAMETER_COLS, nnz); + format_ = SPARSE_CSR; + } else { + size_t width = config_.dims(1); + size_t nnz = config_.size(); + enableIntType(PARAMETER_COLS, width + 1); + enableIntType(PARAMETER_ROWS, nnz); + format_ = SPARSE_CSC; + } + } + } + + /// allocate buffer for the given type + void enableType(ParameterType type, MatType matType = MAT_NORMAL) { + if (bufs_[type] || mats_[type]) { + return; + } + SetDevice device(deviceId_); + if (config_.dims_size() == 2) { + if (matType == MAT_NORMAL || matType == MAT_NORMAL_SHARED || + matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || + matType == MAT_VALUE_SHARED || matType == MAT_SPARSE_ROW_IDS) { + bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); + bufs_[type]->zeroMem(); + } else { + CHECK(isGradSparseUpdate()); + } + if (config_.is_sparse() && type == PARAMETER_VALUE) { + enableSparseParameter(); + } + setMat(type, matType); + } else { + bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); + bufs_[type]->zeroMem(); + } + } + + void enableBufType(ParameterType type) { + if (bufs_[type]) return; + bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); + bufs_[type]->zeroMem(); + } + + void enableIntType(ParameterType type, size_t intStoreSize = 0) { + if (!intBufs_[type]) { + SetDevice device(deviceId_); + size_t size = intStoreSize ?
intStoreSize : config_.size(); + intBufs_[type] = IVector::create(size, useGpu_); + intBufs_[type]->zeroMem(); + } + } + + void enableSharedType(ParameterType type, + VectorPtr vec, + MatrixPtr mat = nullptr) { + if (!bufs_[type] && !mats_[type]) { + bufs_[type] = vec; + mats_[type] = mat; + } + } + + /// for batchGradientMachine: blockNum is number of partitions of the matrix. + bool isGradShared(size_t* blockNum = NULL); + + bool isValueShared(); + + // for AsgdSparseGradientMachine & SgdSparseGradientMachine: + // and MultiGradientMachine + bool isGradSparseUpdate() const; + + bool isSparseRemoteUpdate() const { + return config_.sparse_remote_update() && !useGpu(); + } + + const ParameterConfig& getConfig() const { return config_; } + + ParameterConfig& getConfig() { return config_; } + + bool hasType(ParameterType pType) const { + return bufs_[pType] || mats_[pType]; + } + + const VectorPtr& getBuf(ParameterType pType) const { + return this->bufs_[pType]; + } + + const VectorPtr* getBufs() const { return bufs_; } + + const MatrixPtr& getMat(ParameterType pType) const { return mats_[pType]; } + + void setValueUpdated() { updated_ = true; } + + void clearValueUpdated() { updated_ = false; } + + bool isValueUpdated() const { return updated_; } + + /** + * Save parameter value to a file + */ + bool save(const std::string& filename) const; + + /** + * Save parameter to ostream + */ + bool save(std::ostream& s) const; + + /** + * Load parameter value from a file + */ + bool load(const std::string& filename); + + /** + * Load parameter from istream + */ + bool load(std::istream& is); + + void incShared() { sharedCount_++; } + + /** + * After one of the parameter's gradients is merged, + * you should call this function to do some additional processing. + */ + void incUpdate(const UpdateCallback& callbacks = NULL); + + void clearGradient() { + auto& mat = getMat(PARAMETER_GRADIENT); + if (mat) { + // zeroMem will also clear rows for SparseRowCpuMatrix + mat->zeroMem(); + } else { + auto& gradBuf = getBuf(PARAMETER_GRADIENT); + if (gradBuf) gradBuf->zeroMem(); + } + } + + void initialize(); + + /** + * Initialize the value according to config_: initial_mean, + * initial_std and initial_strategy. + */ + void randomize(); + static void randomize(const VectorPtr& value, const ParameterConfig& config); + + /// Initialize the value to 0 + void zeroMem(); + + /// file header structure + struct Header { + int32_t format; // = PARAM_FORMAT + uint32_t valueSize; // = sizeof(real) + uint64_t size; // = getSize() + }; + + /** + * @brief Whether the header format is supported. + */ + static bool isHeaderFormatSupported(int32_t fmt) { + return fmt < PARAM_FORMAT_ITEMS; + } + + /** + * @brief Get the format in header. + */ + int getHeaderFormat() { return headerFormat_; } + + /** + * @brief Set the format in header. + */ + void setHeaderFormat(int32_t fmt) { + CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: " + << fmt; + headerFormat_ = fmt; + } + + /** + * @brief Parameter Update Hook. + * + * The parameter's update hook runs before ParameterUpdater::updateImpl. + * It can modify the gradient/momentum/etc. here, e.g. drop some gradient. + */ + void updateHook() { + for (auto& hook : updaterHooks_) { + hook->update(this); + } + } + + /** + * @brief Initialize all updater hooks. + * + * This method should be invoked in ParameterUpdater::init() only. + */ + void initHook() { + for (auto& hook : updaterHooks_) { + hook->init(this); + } + } + + protected: + /** + * @brief create the matrix for matType.
+
+ protected:
+  /**
+   * @brief create the matrix to matType.
+   *
+   * used by gradient machines which need to specify the matrix type,
+   * instead of creating it in weights.cpp.
+   *
+   * @note pType should be enabled already.
+   */
+  void setMat(ParameterType pType, int matType);
+
+  bool isUpdatable() { return (updateCounter_ == sharedCount_); }
+
+  void clearUpdate() { updateCounter_ = 0; }
+
+ protected:
+  ParameterConfig config_;
+
+  bool useGpu_;
+
+  int deviceId_;
+
+  /**
+   * @brief bufs_ stores parameter values and gradients.
+   *
+   * Layers should use bufs_[PARAMETER_VALUE] to form the weight matrix for
+   * calculation and store gradients to bufs_[PARAMETER_GRADIENT].
+   */
+  VectorPtr bufs_[NUM_PARAMETER_TYPES];
+
+  /**
+   * @brief Weight matrix for bufs_.
+   *
+   * It's helpful when a parameter is shared by multiple layers.
+   * Callers should check: if the matrix exists, do not create it again.
+   */
+  MatrixPtr mats_[NUM_PARAMETER_TYPES];
+
+  /// Int vectors, used in some user-defined parameter types
+  IVectorPtr intBufs_[NUM_PARAMETER_TYPES];
+
+  int sharedCount_;
+  int updateCounter_;
+
+  bool updated_;
+  SparseFormat format_;
+
+  /// The header format for saving or loading parameters
+  int32_t headerFormat_;
+
+  std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_;
+
+ public:
+  void setSharedCount(int cnt) { sharedCount_ = cnt; }
+  int getSharedCount() { return sharedCount_; }
+
+  bool isSparse() { return config_.is_sparse(); }
+  SparseFormat getFormat() { return format_; }
+
+  static const std::string kMissParameterFail;
+  static const std::string kMissParameterRand;
+  static const std::string kMissParameterZero;
+};
+
+typedef std::map<std::string, ParameterPtr> ParameterMap;
+
+}  // namespace paddle
diff --git a/paddle/parameter/ParameterOptimizer.cpp b/paddle/legacy/parameter/ParameterOptimizer.cpp
similarity index 100%
rename from paddle/parameter/ParameterOptimizer.cpp
rename to paddle/legacy/parameter/ParameterOptimizer.cpp
diff --git a/paddle/parameter/ParameterOptimizer.h b/paddle/legacy/parameter/ParameterOptimizer.h
similarity index 100%
rename from paddle/parameter/ParameterOptimizer.h
rename to paddle/legacy/parameter/ParameterOptimizer.h
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/legacy/parameter/ParameterUpdateFunctions.cpp
similarity index 100%
rename from paddle/parameter/ParameterUpdateFunctions.cpp
rename to paddle/legacy/parameter/ParameterUpdateFunctions.cpp
diff --git a/paddle/legacy/parameter/ParameterUpdateFunctions.h b/paddle/legacy/parameter/ParameterUpdateFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3dbde93b9196a73630f34ef76200933bf4e6dc7e
--- /dev/null
+++ b/paddle/legacy/parameter/ParameterUpdateFunctions.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/utils/Common.h"
+
+namespace paddle {
+
+/**
+ * Performs the following operations.
+ *
+ *   momentumVec = momentum * momentumVec
+ *                 - learningRate * grad
+ *                 - learningRate * decayRate * value
+ *
+ *   value = value + momentumVec
+ *
+ * momentum = 0 or decayRate = 0 are specially handled to avoid unnecessary
+ * computation.
+ */
+void sgdUpdate(real learningRate,
+               real momentum,
+               real decayRate,
+               Vector* value,
+               Vector* grad,
+               Vector* momentumVec);
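As a reading aid, the update rule above can be written as the following minimal scalar sketch (illustrative only: sgdUpdateCpuRef is a hypothetical name mirroring the semantics of the sgdUpdateCpu declaration below, not the optimized implementation; `real` is paddle's float/double typedef):

void sgdUpdateCpuRef(real learningRate,
                     real momentum,
                     real decayRate,
                     size_t size,
                     real* value,
                     const real* grad,
                     real* momentumVec) {
  for (size_t i = 0; i < size; ++i) {
    // momentumVec = momentum * momentumVec - lr * grad - lr * decay * value
    momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
                     learningRate * decayRate * value[i];
    // value = value + momentumVec
    value[i] += momentumVec[i];
  }
}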
+
+void sgdUpdateCpu(real learningRate,
+                  real momentum,
+                  real decayRate,
+                  size_t size,
+                  real* value,
+                  const real* grad,
+                  real* momentumVec);
+
+void sgdUpdateAvx(float learningRate,
+                  float momentum,
+                  float decayRate,
+                  size_t size,
+                  float* value,
+                  const float* grad,
+                  float* momentumVec);
+
+}  // namespace paddle
diff --git a/paddle/parameter/ParameterUpdaterBase.cpp b/paddle/legacy/parameter/ParameterUpdaterBase.cpp
similarity index 100%
rename from paddle/parameter/ParameterUpdaterBase.cpp
rename to paddle/legacy/parameter/ParameterUpdaterBase.cpp
diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/legacy/parameter/ParameterUpdaterBase.h
similarity index 100%
rename from paddle/parameter/ParameterUpdaterBase.h
rename to paddle/legacy/parameter/ParameterUpdaterBase.h
diff --git a/paddle/legacy/parameter/ParameterUpdaterHook.cpp b/paddle/legacy/parameter/ParameterUpdaterHook.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e4677f894acc7632c2a20c49c0a799101357eea6
--- /dev/null
+++ b/paddle/legacy/parameter/ParameterUpdaterHook.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParameterUpdaterHook.h"
+
+#include <algorithm>
+#include <atomic>
+#include <fstream>
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * The static pruning hook.
+ * "Static" means the user specifies a sparsity_ratio before training starts,
+ * and the network prunes the parameters based on that sparsity_ratio. More
+ * details can be found at https://arxiv.org/pdf/1506.02626.pdf.
+ */
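The hook below implements magnitude pruning: it keeps the (1 - sparsity_ratio) fraction of weights with the largest absolute values and zeroes the rest through a 0/1 mask. As a standalone illustration of the same idea (a hypothetical helper, not part of this file), the mask can also be derived with a selection algorithm:

#include <algorithm>
#include <cmath>
#include <functional>
#include <vector>

// Returns a 0/1 mask retaining the `keep` largest-magnitude weights.
// Ties at the threshold may retain slightly more than `keep` entries.
std::vector<float> magnitudeMask(const std::vector<float>& w, size_t keep) {
  std::vector<float> mask(w.size(), 0.0f);
  if (keep == 0) return mask;
  if (keep >= w.size()) return std::vector<float>(w.size(), 1.0f);
  std::vector<float> mag(w.size());
  for (size_t i = 0; i < w.size(); ++i) mag[i] = std::fabs(w[i]);
  std::nth_element(mag.begin(), mag.begin() + (keep - 1), mag.end(),
                   std::greater<float>());
  const float threshold = mag[keep - 1];  // smallest retained magnitude
  for (size_t i = 0; i < w.size(); ++i) {
    if (std::fabs(w[i]) >= threshold) mask[i] = 1.0f;
  }
  return mask;
}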
+
+class StaticPruningHook : public IParameterUpdaterHook {
+ public:
+  explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig)
+      : initCount_(0) {
+    sparsityRatio_ = hookConfig.sparsity_ratio();
+  }
+
+  static bool sortPairAscend(const std::pair<real, size_t> &pair1,
+                             const std::pair<real, size_t> &pair2) {
+    return pair1.first > pair2.first;
+  }
+
+  void update(Parameter *para) {
+    updateThreadChecker_.check();
+    auto &vec = para->getBuf(PARAMETER_GRADIENT);
+    if (vec) {
+      vec->dotMul(*maskVec_);
+    }
+  }
+
+  void generateMask(Parameter *para) {
+    VectorPtr maskTemp = Vector::create(para->getSize(), false);
+    maskTemp->zeroMem();
+    real *maskTempData = maskTemp->getData();
+    size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_);
+
+    VectorPtr paraVec = para->getBuf(PARAMETER_VALUE);
+    VectorPtr paraCpuCopy = Vector::create(para->getSize(), false);
+
+    paraCpuCopy->copyFrom(*paraVec);
+    std::vector<std::pair<real, size_t>> param;
+
+    for (size_t i = 0; i < para->getSize(); i++)
+      param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i));
+
+    std::partial_sort(
+        param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend);
+    for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0;
+
+    // Currently just use a mask vector as a hack.
+    if (para->useGpu()) {
+      maskVec_ = Vector::create(para->getSize(), para->useGpu());
+      maskVec_->copyFrom(*maskTemp);
+    } else {
+      maskVec_ = maskTemp;
+    }
+  }
+
+  void init(Parameter *para) {
+    generateMask(para);
+    size_t initCount = this->initCount_.fetch_add(1);
+    CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must be "
+                                "invoked in the same ParameterUpdater";
+    VLOG(3) << "Initialize Parameter " << para;
+    SetDevice device(para->getDeviceId());
+
+    auto &paraVec = para->getBuf(PARAMETER_VALUE);
+    paraVec->dotMul(*maskVec_);
+  }
+
+ private:
+  SameThreadChecker updateThreadChecker_;
+  std::atomic<size_t> initCount_;
+  VectorPtr maskVec_;
+  real sparsityRatio_;
+};
+
+IParameterUpdaterHook::IParameterUpdaterHook() {}
+
+IParameterUpdaterHook::~IParameterUpdaterHook() {}
+
+/**
+ * A Hasher used by g_hooks.
+ *
+ * An independent hasher is used intentionally: PServer has a hasher for
+ * hashing ParameterBlock, but it is not reused here, to reduce dependencies.
+ *
+ * It may be extracted to Util.h to unify the hashers.
+ */
+class StringIntPairHasher {
+ public:
+  size_t operator()(const std::pair<std::string, int> &k) const {
+    return intHasher_(strHasher_(k.first) + k.second);
+  }
+
+ private:
+  std::hash<std::string> strHasher_;
+  std::hash<int> intHasher_;
+};
+
+static WeakKVCache<std::pair<std::string, int>,
+                   IParameterUpdaterHook,
+                   StringIntPairHasher>
+    g_hookCache_;
+
+/**
+ * The actual factory method for ParameterUpdaterHook.
+ */ +static IParameterUpdaterHook *createImpl( + const ParameterUpdaterHookConfig &config) { + auto &type = config.type(); + if (type == "pruning") { + return new StaticPruningHook(config); + } + + LOG(FATAL) << "Unknown Hook type: " << type; + return nullptr; +} + +std::shared_ptr IParameterUpdaterHook::create( + const ParameterConfig ¶mConfig, int idx) { + std::pair key = {paramConfig.name(), idx}; + return g_hookCache_.get( + key, [&] { return createImpl(paramConfig.update_hooks(idx)); }); +} + +} // namespace paddle diff --git a/paddle/parameter/ParameterUpdaterHook.h b/paddle/legacy/parameter/ParameterUpdaterHook.h similarity index 100% rename from paddle/parameter/ParameterUpdaterHook.h rename to paddle/legacy/parameter/ParameterUpdaterHook.h diff --git a/paddle/parameter/Regularizer.cpp b/paddle/legacy/parameter/Regularizer.cpp similarity index 100% rename from paddle/parameter/Regularizer.cpp rename to paddle/legacy/parameter/Regularizer.cpp diff --git a/paddle/parameter/Regularizer.h b/paddle/legacy/parameter/Regularizer.h similarity index 100% rename from paddle/parameter/Regularizer.h rename to paddle/legacy/parameter/Regularizer.h diff --git a/paddle/parameter/ThreadLocalBuffer.cpp b/paddle/legacy/parameter/ThreadLocalBuffer.cpp similarity index 100% rename from paddle/parameter/ThreadLocalBuffer.cpp rename to paddle/legacy/parameter/ThreadLocalBuffer.cpp diff --git a/paddle/legacy/parameter/ThreadLocalBuffer.h b/paddle/legacy/parameter/ThreadLocalBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..d360feeed6c98ee60e3bdae924434054080576b0 --- /dev/null +++ b/paddle/legacy/parameter/ThreadLocalBuffer.h @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/legacy/math/Vector.h" + +namespace paddle { +namespace parameter { +extern VectorPtr* getThreadLocalBuffer(); +} // namespace parameter +} // namespace paddle diff --git a/paddle/parameter/Weight.cpp b/paddle/legacy/parameter/Weight.cpp similarity index 100% rename from paddle/parameter/Weight.cpp rename to paddle/legacy/parameter/Weight.cpp diff --git a/paddle/legacy/parameter/Weight.h b/paddle/legacy/parameter/Weight.h new file mode 100644 index 0000000000000000000000000000000000000000..241c8d829cd0c7b57964324d3378bdfcf09e6a70 --- /dev/null +++ b/paddle/legacy/parameter/Weight.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include + +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseRowMatrix.h" +#include "paddle/legacy/parameter/Parameter.h" + +namespace paddle { + +class Weight { + private: + MatrixPtr weight_; + MatrixPtr weightGrad_; + ParameterPtr parameter_; + + public: + Weight(size_t height, size_t width, ParameterPtr parameter); + Weight(size_t height, size_t width, ParameterPtr parameter, size_t offset); + + const MatrixPtr& getW() { return weight_; } + const MatrixPtr& getWGrad() { return weightGrad_; } + const ParameterPtr& getParameterPtr(); + + void incUpdate(const UpdateCallback& callback) { + getParameterPtr()->incUpdate(callback); + } + + void setParameterPtr(ParameterPtr param); +}; + +typedef std::vector> WeightList; + +} // namespace paddle diff --git a/paddle/parameter/tests/CMakeLists.txt b/paddle/legacy/parameter/tests/CMakeLists.txt similarity index 100% rename from paddle/parameter/tests/CMakeLists.txt rename to paddle/legacy/parameter/tests/CMakeLists.txt diff --git a/paddle/legacy/parameter/tests/test_argument.cpp b/paddle/legacy/parameter/tests/test_argument.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0c632e0cd10342431dfcada680a18d8f9eabeb9c --- /dev/null +++ b/paddle/legacy/parameter/tests/test_argument.cpp @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +using namespace paddle; // NOLINT + +TEST(Argument, poolSequenceWithStride) { + Argument input, output; + ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false); + int* inStart = input.sequenceStartPositions->getMutableData(false); + inStart[0] = 0; + inStart[1] = 9; + inStart[2] = 14; + inStart[3] = 17; + inStart[4] = 30; + + int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30}; + int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30}; + + for (auto reversed : {false, true}) { + ICpuGpuVectorPtr stridePositions; + output.poolSequenceWithStride( + input, 5 /* stride */, &stridePositions, reversed); + + const int* outStart = output.sequenceStartPositions->getData(false); + CHECK_EQ(outStart[0], 0); + CHECK_EQ(outStart[1], 2); + CHECK_EQ(outStart[2], 3); + CHECK_EQ(outStart[3], 4); + CHECK_EQ(outStart[4], 7); + + CHECK_EQ(stridePositions->getSize(), 8UL); + auto result = reversed ? strideResultReversed : strideResult; + for (int i = 0; i < 8; i++) { + CHECK_EQ(stridePositions->getData(false)[i], result[i]); + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/parameter/tests/test_common.cpp b/paddle/legacy/parameter/tests/test_common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3c4ee11934b0dd487517b3799611c8c1a153f52d --- /dev/null +++ b/paddle/legacy/parameter/tests/test_common.cpp @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include +#include +#include +#include +#include + +using namespace paddle; // NOLINT + +class CommonTest : public ::testing::Test { + protected: + CommonTest() : testStat_("test") {} + virtual ~CommonTest() {} + virtual void SetUp() { + const size_t buffSize[] = { + 100, 128, 500, 1024, 4096, 10240, 102400, 1000000}; + sizeVec_.resize(8); + memcpy(&sizeVec_[0], &buffSize[0], 8 * sizeof(size_t)); + valueUint_.resize(4); + valueUint_[0].first = 0.0; + valueUint_[0].second = 0.0; + valueUint_[1].first = 0.0; + valueUint_[1].second = 1.0; + valueUint_[2].first = 1.0; + valueUint_[2].second = 0.0; + valueUint_[3].first = 1.0; + valueUint_[3].second = 1.0; + learningRate_ = 1.0; + } + + void test_sgdUpadate(real* gradientBuffer, + real* valueBuffer, + real* momentumBuffer, + size_t size); + + virtual void TreaDown() { LOG(INFO) << "All Test Finished."; } + + protected: + std::vector> valueUint_; + std::vector sizeVec_; + real learningRate_; + StatSet testStat_; +}; + +void CommonTest::test_sgdUpadate(real* gradientBuffer, + real* valueBuffer, + real* momentumBuffer, + size_t size) { +// sgdUpdateAvx has no double version yet +#if defined(__AVX__) && !defined(PADDLE_TYPE_DOUBLE) + real valueSum1 = 0, valueSum2 = 0, momSum1 = 0, momSum2 = 0; + real* gradTmp = new real[size]; + real* valueTmp = new real[size]; + real* momentumTmp = new real[size]; + memcpy(gradTmp, gradientBuffer, size * sizeof(real)); + memcpy(valueTmp, valueBuffer, size * sizeof(real)); + memcpy(momentumTmp, momentumBuffer, size * sizeof(real)); + for (auto& arg : valueUint_) { + { + { + struct timeval t; + REGISTER_TIMER("gettimeofday", 0, testStat_); + gettimeofday(&t, NULL); + } + REGISTER_TIMER("avxTimer", 0); + sgdUpdateAvx(learningRate_, + arg.first, + arg.second, + size, + valueBuffer, + gradientBuffer, + momentumBuffer); + } + for (size_t i = 0; i < size; i++) { + valueSum1 += valueBuffer[i]; + momSum1 += momentumBuffer[i]; + // std::cout << "[" + // << valueBuffer[i] + // << "," << momentumBuffer[i] + // << "," << gradientBuffer[i] << "],"; + } + { + REGISTER_TIMER("cpuTimer", 0); + sgdUpdateCpu(learningRate_, + arg.first, + arg.second, + size, + valueTmp, + gradTmp, + momentumTmp); + } + for (size_t i = 0; i < size; i++) { + valueSum2 += valueTmp[i]; + momSum2 += momentumTmp[i]; + // std::cout << "[" + // << valueTmp[i] + // << "," << momentumTmp[i] + // << "," << gradTmp[i] << "],"; + } + + VLOG(3) << "valueSum1 = " << valueSum1 << " ; valueSum2 = " << valueSum2; + VLOG(3) << "momSum1 = " << momSum1 << " ; momSum2 = " << momSum2; + ASSERT_EQ(valueSum1, valueSum2); + ASSERT_EQ(momSum1, momSum2); + } + delete[] gradTmp; + delete[] valueTmp; + delete[] momentumTmp; +#endif +} + +TEST_F(CommonTest, sgdUpdate) { + const size_t alignHeader[] = {0, 2, 3, 5, 7, 8}; + for (auto& size : sizeVec_) { + real *gradientBuffer, *valueBuffer, *momentumBuffer; + CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size), + 0); + 
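+    // The 32-byte alignment matches the AVX register width exercised by
+    // sgdUpdateAvx, which these buffers are fed into below.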
+    CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0);
+    CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size),
+             0);
+
+    for (size_t i = 0; i < size; i++) {
+      gradientBuffer[i] = 1.0;
+      valueBuffer[i] = 2.0;
+      momentumBuffer[i] = 3.0;
+    }
+    for (int i = 0; i < 6; i++) {
+      LOG(INFO) << "----------------------" << size << ":" << alignHeader[i]
+                << "-------------------------";
+      test_sgdUpadate(&gradientBuffer[alignHeader[i]],
+                      &valueBuffer[alignHeader[i]],
+                      &momentumBuffer[alignHeader[i]],
+                      size - alignHeader[i]);
+    }
+    free(gradientBuffer);
+    free(valueBuffer);
+    free(momentumBuffer);
+  }
+  globalStat.printAllStatus();
+  testStat_.printAllStatus();
+}
+
+TEST_F(CommonTest, syncThreadPool) {
+  SyncThreadPool pool(10);
+
+  std::vector<int> nums;
+  nums.resize(10);
+
+  pool.exec([&](int tid, size_t numThreads) { nums[tid] = tid; });
+  for (size_t i = 0; i < nums.size(); ++i) {
+    EXPECT_EQ((int)i, nums[i]);
+  }
+
+  pool.exec([&](int tid, size_t numThreads) { nums[tid] -= tid; });
+  for (size_t i = 0; i < nums.size(); ++i) {
+    EXPECT_EQ((int)0, nums[i]);
+  }
+}
diff --git a/paddle/pserver/BaseClient.cpp b/paddle/legacy/pserver/BaseClient.cpp
similarity index 100%
rename from paddle/pserver/BaseClient.cpp
rename to paddle/legacy/pserver/BaseClient.cpp
diff --git a/paddle/legacy/pserver/BaseClient.h b/paddle/legacy/pserver/BaseClient.h
new file mode 100644
index 0000000000000000000000000000000000000000..92bb0a8b6a1ac896b8a281601e407a729556d5f0
--- /dev/null
+++ b/paddle/legacy/pserver/BaseClient.h
@@ -0,0 +1,311 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ParameterService.pb.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/pserver/ProtoServer.h"
+#include "paddle/utils/Common.h"
+#include "paddle/utils/Queue.h"
+
+namespace paddle {
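As an illustrative usage sketch of the client declared below (a hypothetical call site: `client`, the buffer size, and the clientId/rootId values are assumptions, and `client` must be an instance of a concrete BaseClient subclass; `real` is paddle's float/double typedef):

// Every trainer contributes its locally computed features; the summed
// result lands in `sum` on the client whose clientId equals rootId.
std::vector<real> features(1024, 1.0);
std::vector<real> sum(1024, 0.0);
client.reduce(features.data(), sum.data(), features.size(),
              /*clientId=*/0, /*rootId=*/0);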
+
+/**
+ * It manages all connections to pservers. There are two modes for managing
+ * connections to all pservers. In the first mode, each connection owns two
+ * threads that separately handle sending and receiving data. In the second
+ * mode, each thread uses one connection for all of its activity. The first
+ * mode is armed with sendThreads_/recvThreads_ and
+ * sendJobQueue_/recvJobQueue_; the second mode uses a shared thread pool to
+ * manage connections.
+ */
+class BaseClient {
+ protected:
+  typedef std::unique_ptr<std::thread> ThreadPtr;
+  typedef std::vector<std::vector<iovec>> InputIovs;
+  typedef std::vector<SendParameterRequest> SendRequest;
+  typedef std::vector<SendDataRequest> SendDataRequestVec;
+
+  // TODO(yanfei):
+  // refine the data structure to unify parameter and feature communication
+  struct SendJob {
+    /// store parameter-related block data
+    InputIovs parallelInputIovs;
+    /// store protobuf requests
+    SendRequest parallelRequests;
+    /// store data, such as features for metric learning
+    SendDataRequestVec parallelDataRequests;
+  };
+
+ public:
+  explicit BaseClient(bool separate = false, int numPorts = FLAGS_ports_num);
+
+  virtual ~BaseClient();
+
+  typedef std::shared_ptr<SendJob> SendJobPtr;
+  typedef Queue<SendJobPtr> SendQueue;
+
+  /// send data to the server; only the synchronous mode is supported
+  template <class DataType>
+  void putData(int clientId,
+               SendDataType type,
+               DataType* datas,
+               size_t size,
+               DataUpdateMode mode) {
+    synchronize(SYNC_DATA);
+    sendData(clientId, type, mode, datas, size);
+    recvData();
+    synchronize(SYNC_DATA);
+  }
+
+  template <class DataType>
+  void putOwnData(int clientId,
+                  SendDataType type,
+                  DataType* datas,
+                  size_t size) {
+    putData(clientId, type, datas, size, DATA_UPDATE_MODE_SET_OWN);
+  }
+
+  template <class DataType>
+  void getAllData(int clientId,
+                  SendDataType type,
+                  DataType* datas,
+                  size_t size) {
+    sendData(clientId,
+             type,
+             DATA_UPDATE_MODE_GET_ALL,
+             reinterpret_cast<DataType*>(NULL),
+             0);
+    recvData();
+    size_t dataOffset = 0;
+    for (auto& recvMem : recvDataMems_) {
+      CHECK_LE(dataOffset, size);
+      size_t memSize = std::min(recvMem.get()->getSize(),
+                                sizeof(DataType) * (size - dataOffset));
+      CHECK_EQ(memSize % sizeof(DataType), size_t(0));
+      memcpy(datas + dataOffset, recvMem.get()->getBuf(), memSize);
+      dataOffset += memSize / sizeof(DataType);
+    }
+    CHECK_EQ(dataOffset, size);
+  }
+
+  /**
+   * Reduces values across all clients.
+   * This reduce only supports SUM.
+   * The result is saved in the recvBuf of the rootId client.
+   */
+  template <class DataType>
+  void reduce(DataType* sendBuf,
+              DataType* recvBuf,
+              size_t size,
+              int clientId,
+              int rootId) {
+    putOwnData(clientId, DATA_REDUCE_SUM, sendBuf, size);
+    if (rootId == clientId) {
+      getAllData(clientId, DATA_REDUCE_SUM, recvBuf, size);
+    }
+  }
+
+  /**
+   * return the trans data type according to the input type
+   */
+  virtual TransDataType getTransDtype(const std::type_info& info) {
+    TransDataType dataType;
+    if (typeid(int*) == info) {  // NOLINT
+      dataType = TRANS_INT32;
+    } else if (typeid(uint32_t*) == info) {  // NOLINT
+      dataType = TRANS_UINT32_T;
+    } else if (typeid(int64_t*) == info) {  // NOLINT
+      dataType = TRANS_INT64_T;
+    } else if (typeid(uint64_t*) == info) {  // NOLINT
+      dataType = TRANS_UINT64_T;
+    } else if (typeid(float*) == info) {  // NOLINT
+      dataType = TRANS_FLOAT;
+    } else if (typeid(double*) == info) {  // NOLINT
+      dataType = TRANS_DOUBLE;
+    } else {
+      LOG(FATAL) << "not supported";
+    }
+    return dataType;
+  }
+
+ protected:
+  /// for a > 0, b > 0:
+  /// return the smallest x s.t. b*x >= a
+  static int divup(int a, int b) { return (a + b - 1) / b; }
+
+  int calcClientId(int i, int serviceNum) {
+    return (i + FLAGS_trainer_id * numPorts_) % serviceNum;
+  }
+
+  /// start threads in sendThreads_ and recvThreads_
+  void startThreads();
+
+  /// finish threads in sendThreads_ and recvThreads_
+  void finishThreads();
+
+  template <class DataType>
+  void prepareData(int clientId,
+                   SendDataType type,
+                   DataUpdateMode updateMode,
+                   DataType* datas,
+                   size_t size,
+                   SendJob* sendJob) {
+    sendJob->parallelDataRequests.resize(serviceNum_);
+    sendJob->parallelInputIovs.resize(serviceNum_);
+    for (int i = 0; i < serviceNum_; ++i) {
+      auto& request = sendJob->parallelDataRequests[i];
+      request.set_update_mode(updateMode);
+      request.set_type(type);
+      request.set_client_id(clientId);
+      request.set_server_id(i);
+    }
+
+    /// split the data that needs to be sent to the server into serviceNum_
+    /// pieces
+    if (!datas) {
+      CHECK(!size) << "ownSize should be zero since datas is nullptr";
+    }
+    size_t baseSize = size / serviceNum_;
+    size_t dataOffset = 0;
+    for (int i = 0; i < serviceNum_; ++i) {
+      auto& request = sendJob->parallelDataRequests[i];
+      DataBlock* block = request.add_blocks();
+      size_t ownSize = size_t(i) < size % serviceNum_ ? baseSize + 1 : baseSize;
+      size_t realSize = datas ? std::max(ownSize, size_t(1)) : 0;
+      block->set_total_size(realSize * sizeof(DataType));
+      block->set_data_size(sizeof(DataType));
+      // TODO(yuyang18): The getTransDtype can be rewritten as a template
+      // method to reduce runtime overhead.
+      block->set_data_type(getTransDtype(typeid(DataType*)));  // NOLINT
+      if (datas) {
+        sendJob->parallelInputIovs[i].push_back(
+            {datas + dataOffset, realSize * sizeof(DataType)});
+      }
+      dataOffset += ownSize;
+    }
+    CHECK_EQ(dataOffset, size);
+  }
+
+  /**
+   * @brief send data to all data servers
+   *
+   * @note each trainer sends all its data to all data servers;
+   *       this is for broadcast data synchronization, such as feature
+   *       synchronization in metric learning.
+   */
+  template <class DataType>
+  void sendData(int clientId,
+                SendDataType type,
+                DataUpdateMode updateMode,
+                DataType* datas,
+                size_t size) {
+    SendJobPtr sendJob = std::make_shared<SendJob>();
+    prepareData(clientId, type, updateMode, datas, size, sendJob.get());
+    for (int i = 0; i < threadNum_; ++i) {
+      sendJobQueue_[i]->enqueue(sendJob);
+    }
+  }
+
+  /**
+   * @brief recv data from all data servers
+   *
+   * @note synchronizes all recv threads
+   */
+  void recvData();
+
+  /// send requests and receive responses
+  template <typename ProtoIn, typename ProtoOut>
+  void multiCall(const char* funcName,
+                 const ProtoIn& request,
+                 std::vector<ProtoOut>* responses) {
+    responses->resize(clients_.size());
+    size_t numClients = clients_.size();
+    for (size_t i = 0; i < numClients; ++i) {
+      clients_[i].send(funcName, request);
+    }
+    for (size_t i = 0; i < numClients; ++i) {
+      clients_[i].recv(&(*responses)[i]);
+    }
+  }
+
+  /**
+   * @brief synchronize all trainers and pservers
+   *
+   * @note used to ensure that the data of all trainers has been received
+   */
+  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
+
+  /**
+   * @brief use multiple threads to send data separately
+   *
+   * @note each thread reads its own JobQueue to handle requests;
+   *       each thread uses calcClientId() to retrieve the connections it
+   *       manages itself.
+   *       send and recv are implemented in the child class.
+   */
+  virtual void send(int threadId) = 0;
+
+  /**
+   * @brief use multiple threads to receive data separately
+   *
+   * @note almost the same as send()
+   */
+  virtual void recv(int threadId) = 0;
+
+ protected:
+  bool stopping_;
+  /// nodes * ports, i.e. the number of real pservers
+  int serviceNum_;
+  /**
+   * Number of threads for managing all services. Normally the
+   * number of pservers is no more than a few hundred, so
+   * thread-based parallelization can benefit traffic performance
+   * and the pserver's SGD optimization performance.
+   */
+  int threadNum_;
+  /// the connection manager at the client end
+  std::vector<ProtoClient> clients_;
+  /// send threads for parallelization
+  std::vector<ThreadPtr> sendThreads_;
+  /// recv threads for parallelization
+  std::vector<ThreadPtr> recvThreads_;
+  std::unique_ptr<ThreadBarrier> recvSyncBarrier_;
+
+  // TODO(yanfei):
+  // Currently the pserver returns values only after the optimization of
+  // all parameters has finished, so recv is not actually overlapped.
+  // A more robust implementation would pipeline all send/recv actions at
+  // the parameter-unit level; that would benefit deep and large model
+  // training in the future, especially when local node computing power
+  // surpasses the inter-connection, such as in GPU clusters, even
+  // single-box GPU clusters.
+  // queue for buffering send requests
+  /**
+   * The send/recv queues cooperate with each other to accomplish
+   * overlapping communication with the forwardBackward action.
+   */
+  std::vector<std::unique_ptr<SendQueue>> sendJobQueue_;
+  /// queue for buffering recv requests
+  std::vector<std::unique_ptr<SendQueue>> recvJobQueue_;
+  /// specific for dserver
+  SendJob sendJob_;
+  /// port num for each node
+  int numPorts_;
+  /// if set, overlapped optimization is disabled
+  bool separateSendAndRecv_;
+  std::vector<CpuMemHandlePtr> recvDataMems_;
+};
+}  // namespace paddle
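The per-thread sendJobQueue_/recvJobQueue_ pair above is a standard blocking producer/consumer pipeline. A generic sketch of the pattern (a simplified stand-in for paddle's Queue, not the actual implementation):

#include <condition_variable>
#include <deque>
#include <mutex>

template <typename T>
class JobQueue {
 public:
  void enqueue(T job) {
    {
      std::lock_guard<std::mutex> guard(mutex_);
      queue_.push_back(std::move(job));
    }
    notEmpty_.notify_one();
  }
  T dequeue() {
    std::unique_lock<std::mutex> lock(mutex_);
    notEmpty_.wait(lock, [this] { return !queue_.empty(); });
    T job = std::move(queue_.front());
    queue_.pop_front();
    return job;
  }

 private:
  std::mutex mutex_;
  std::condition_variable notEmpty_;
  std::deque<T> queue_;
};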
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/legacy/pserver/CMakeLists.txt
similarity index 100%
rename from paddle/pserver/CMakeLists.txt
rename to paddle/legacy/pserver/CMakeLists.txt
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/legacy/pserver/LightNetwork.cpp
similarity index 100%
rename from paddle/pserver/LightNetwork.cpp
rename to paddle/legacy/pserver/LightNetwork.cpp
diff --git a/paddle/pserver/LightNetwork.h b/paddle/legacy/pserver/LightNetwork.h
similarity index 100%
rename from paddle/pserver/LightNetwork.h
rename to paddle/legacy/pserver/LightNetwork.h
diff --git a/paddle/legacy/pserver/ParameterClient2.cpp b/paddle/legacy/pserver/ParameterClient2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..98b3966250c60ecba7a48320c98f47c590ceb95c
--- /dev/null
+++ b/paddle/legacy/pserver/ParameterClient2.cpp
@@ -0,0 +1,781 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+
+#include "ParameterClient2.h"
+#include "paddle/legacy/math/SparseRowMatrix.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/StringUtil.h"
+
+DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
+DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
+
+namespace paddle {
+
+template <typename T1, typename T2>
+void copyToRepeatedField(google::protobuf::RepeatedField<T1>* dest,
+                         const T2* src,
+                         size_t size) {
+  dest->Clear();
+  dest->Reserve(size);
+  for (size_t i = 0; i < size; ++i) {
+    dest->AddAlreadyReserved(src[i]);
+  }
+}
+
+ParameterClient2::ParameterClient2(bool separate, int port, int numPorts)
+    : BaseClient(separate, numPorts), port_(port) {
+#ifndef PADDLE_DISABLE_TIMER
+  forwardbackwordTime_ = 0;
+#endif
+}
+
+int ParameterClient2::calcParameterBlockSize(
+    const std::vector<ParameterPtr>& parameters, size_t serviceNum) {
+  size_t totalSize = 0;
+  for (auto& para : parameters) {
+    totalSize += para->getSize();
+  }
+  size_t perServerSize = totalSize / serviceNum;
+
+  int sizeBits = 64 - __builtin_clzl(perServerSize);
+
+  /// 2^10 is the min block size
+  /// 2^7 will be the max number of blocks in one pserver
+  int blockSizeBits = std::max((sizeBits - 7), 10);
+  return 1 << blockSizeBits;
+}
+
+void ParameterClient2::initThreads() {
+  threadNum_ = serviceNum_;
+  if (FLAGS_parallel_thread_num > 1) {
+    LOG(INFO) << "parallel_thread_num doesn't need to be set";
+  }
+  syncThreadPool_.reset(new SyncThreadPool(threadNum_));
+  startThreads();
+}
+
+bool ParameterClient2::init(const std::vector<ParameterPtr>& parameters) {
+  destroy();
+
+  std::vector<std::string> hosts;
+  str::split(FLAGS_pservers, ',', &hosts);
+  serviceNum_ = hosts.size() * numPorts_;
+  uint64_t denseBlockSize = calcParameterBlockSize(parameters, serviceNum_);
+
+  /// set up the prefetch matrix if it exists
+  for (auto& para : parameters) {
+    /// set the block size for each parameter
+    para->getConfig().set_parameter_block_size(
+        para->getConfig().sparse_remote_update() ? para->getConfig().dims(1)
+                                                 : denseBlockSize);
+  }
+
+  for (auto& para : parameters) {
+    CHECK_NE(-1UL, para->getID()) << "id in parameter is not initialized";
+    parameterMap_[para->getID()] = para;
+  }
+
+  allSegments_.reserve(parameters.size());
+
+  for (auto& para : parameters) {
+    ParameterSegments segments;
+    segments.name = para->getName();
+    segments.id = para->getID();
+    allSegments_.push_back(segments);
+    if (para->getConfig().sparse_remote_update()) {
+      CHECK_EQ(para->getConfig().parameter_block_size(),
+               para->getConfig().dims(1))
+          << "For a sparse remote update parameter,"
+          << " the block size is the width of each row.";
+    }
+  }
+
+  /// init clients
+  clients_.reserve(serviceNum_);
+  recvDataMems_.resize(serviceNum_);
+
+  for (size_t i = 0; i < hosts.size(); ++i) {
+    for (int j = 0; j < numPorts_; ++j) {
+      LOG(INFO) << "pserver " << i * numPorts_ + j << " " << hosts[i] << ":"
+                << port_ + j;
+      if (FLAGS_rdma_tcp == "rdma") {
+        clients_.emplace_back(hosts[i], port_ + j, F_RDMA);
+      } else {
+        clients_.emplace_back(hosts[i], port_ + j, F_TCP);
+      }
+    }
+  }
+
+  sparseDistribution_.reset(new SparseParameterDistribution(serviceNum_));
+
+  sleep(2);
+
+  initThreads();
+
+  return true;
+}
+
+ParameterClient2::~ParameterClient2() { destroy(); }
+
+void ParameterClient2::destroy() {
+  if (clients_.empty()) {
+    /// this means the client was not initialized.
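+    // Note: init() and the destructor both call destroy(), so reaching this
+    // branch repeatedly is expected and safe.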
+ return; + } + finishThreads(); + + parameterMap_.clear(); + allSegments_.clear(); + clients_.clear(); +} + +void ParameterClient2::sendParallel(int tid, + size_t numThreads, + ParameterType recvParameterType) { + int numMyClients = divup(serviceNum_ - tid, numThreads); + + for (int j = 0; j < numMyClients; ++j) { + REGISTER_TIMER("client_sendAndRecv_send"); + int i = numThreads * j + tid; + /// Try to make different clients to send data to different pservers + /// at the same time so that they will not flood data to the same + /// pserver. + i = calcClientId(i, serviceNum_); + clients_[i].send("sendParameter", + sendJob_.parallelRequests[i], + sendJob_.parallelInputIovs[i]); + + /// clear large structure + sendJob_.parallelRequests[i].Clear(); + sendJob_.parallelInputIovs[i].clear(); + } + + std::vector bufs; + SendParameterResponse response; + for (int j = 0; j < numMyClients; ++j) { + REGISTER_TIMER("client_sendAndRecv_recv"); + int i = numThreads * j + tid; + i = calcClientId(i, serviceNum_); + auto msgReader = clients_[i].recv(&response); + CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size()); + bufs.clear(); + bufs.reserve(response.blocks_size()); + for (auto& block : response.blocks()) { + auto it = parameterMap_.find(block.para_id()); + CHECK(it != parameterMap_.end()); + Parameter* parameter = it->second.get(); + real* buf = nullptr; + if (parameter->getBuf(recvParameterType)) { + buf = parameter->getBuf(recvParameterType)->getPoint(block.begin_pos()); + } else { + auto recvMat = dynamic_cast( + parameter->getMat(recvParameterType).get()); + CHECK(recvMat); + size_t width = parameter->getConfig().dims(1); + // TODO(wuyi): need add lock here? may also cause resize. + buf = recvMat->getLocalRow(block.begin_pos() / width); + } + /// sparse_id is not useful while receiving data since sparse data + /// storage is continuous, do commit recieved data as that of dense. 
+ bufs.push_back(buf); + } + msgReader->readBlocks(bufs); + } +} + +void ParameterClient2::prepareSendData( + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, + BatchStatus batchStatus, + SendJob* sendJob) { + sendJob->parallelRequests.resize(serviceNum_); + sendJob->parallelInputIovs.resize(serviceNum_); + + for (auto& request : sendJob->parallelRequests) { +#ifndef PADDLE_DISABLE_TIMER + if (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT) { + request.set_forwardbackward_time(forwardbackwordTime_); + } +#endif + request.set_trainer_id(trainerId_); + request.set_update_mode(updateMode); + request.set_send_back_parameter(sendBackParameter); + request.set_send_back_parameter_type(sendBackParameterType); + request.set_num_samples(numSamples); + request.set_cost(cost); + request.set_batch_status(batchStatus); + CHECK_EQ(request.blocks_size(), 0); + VLOG(10) << "request: trainer_id: " << request.trainer_id() + << " update_mode" << request.update_mode() + << " send_back_parameter: " << request.send_back_parameter() + << " send_back_parameter_type: " + << request.send_back_parameter_type() + << " num_samples: " << request.num_samples() + << " cost: " << request.cost() + << " batch_status: " << request.batch_status(); + } + for (const auto& segments : parameterSegments) { + const auto it = parameterMap_.find(segments.id); + CHECK(it != parameterMap_.end()); + Parameter* parameter = it->second.get(); + CHECK(parameter != nullptr) << "parameter is nullptr"; + int64_t nameHash = std::hash()(segments.name); + bool sendingPara = !(updateMode == PSERVER_UPDATE_MODE_GET_PARAM || + updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE || + updateMode == PSERVER_UPDATE_MODE_SET_PARAM_ZERO); + bool sparseUpdate = parameter->getConfig().sparse_remote_update() && + (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT || + updateMode == PSERVER_UPDATE_MODE_ASYNC_SGD || + updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE); + + const auto blockSize = parameter->getConfig().parameter_block_size(); + CHECK_GE(blockSize, 1LU) << "blockSize should > 0 " << blockSize; + const auto paraSize = parameter->getSize(); + if (sparseUpdate) { + auto prefetchMat = std::dynamic_pointer_cast( + parameter->getMat(PARAMETER_VALUE)); + CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr"; + auto sendMat = dynamic_cast( + parameter->getMat(parameterType).get()); + CHECK(sendMat != nullptr) << "sendMat is nullptr"; + + syncThreadPool_->exec([&](int tid, size_t numThreads) { + std::lock_guard guard(sparseAutoGrowthMutex_); + const auto& localIndices = prefetchMat->getLocalIndices(); + /// num of sparse rows + size_t nLocalBlocks = localIndices.size(); + uint64_t beginDim = 0; + uint64_t endDim = 0; + + // HACK(typhoonzero): let it resize first + prefetchMat->getLocalRow(nLocalBlocks); + sendMat->getLocalRow(nLocalBlocks); + + for (size_t row = 0; row < nLocalBlocks; ++row) { + int64_t blockId = localIndices[row]; // local row -> sparse row + int serverId = std::abs((blockId + nameHash) % serviceNum_); + if (serverId % numThreads != (size_t)tid) { + continue; + } + + beginDim = blockId * blockSize; + endDim = std::min(beginDim + blockSize, paraSize); + + auto& request = sendJob->parallelRequests[serverId]; + ParameterBlock* block = request.add_blocks(); + block->set_para_id(segments.id); + /// global sparse row id + block->set_block_id(blockId); + /// local row offset + 
block->set_begin_pos(row * blockSize); + /// block len + block->set_block_size(endDim - beginDim); + if (sendingPara) { + sendJob->parallelInputIovs[serverId].push_back( + {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize}); + /// detect sparse parameter distribution + sparseDistribution_->probeDistribution(serverId, + sizeof(real) * blockSize); + } + } + }); + + } else { /// parameter set for dense and sparse + real* buf = + sendingPara ? parameter->getBuf(parameterType)->getPoint(0) : nullptr; + uint64_t endDim = 0; + for (uint64_t beginDim = 0; beginDim < paraSize; beginDim = endDim) { + endDim = std::min(beginDim + blockSize, paraSize); + int64_t blockId = beginDim / blockSize; + int serverId = std::abs((blockId + nameHash) % serviceNum_); + + auto& request = sendJob->parallelRequests[serverId]; + ParameterBlock* block = request.add_blocks(); + block->set_para_id(segments.id); + block->set_block_id(blockId); + block->set_begin_pos(beginDim); + block->set_block_size(endDim - beginDim); + if (buf) { + sendJob->parallelInputIovs[serverId].push_back( + {buf + beginDim, sizeof(real) * ((size_t)(endDim - beginDim))}); + } + } + } + } // parameterSegments + + sparseDistribution_->checkAndResetDistribution(); +} + +void ParameterClient2::sendAndReceiveParameter( + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, + ParameterType recvParameterType) { + prepareSendData(updateMode, + parameterType, + parameterSegments, + numSamples, + cost, + sendBackParameter, + sendBackParameterType, + /*batchStatus = */ BATCH_START_AND_FINISH, + &sendJob_); + + syncThreadPool_->exec([&](int tid, size_t numThreads) { + this->sendParallel(tid, numThreads, recvParameterType); + }); +} + +void ParameterClient2::sendParameter( + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + BatchStatus batchStatus) { + SendJobPtr sendJob = std::make_shared(); + prepareSendData(updateMode, + parameterType, + parameterSegments, + numSamples, + cost, + sendBackParameter, + PARAMETER_VALUE, + batchStatus, + sendJob.get()); + + for (int i = 0; i < threadNum_; i++) { + sendJobQueue_[i]->enqueue(sendJob); + } +} + +void ParameterClient2::recvParameter() { recvSyncBarrier_->wait(); } + +void ParameterClient2::send(int threadId) { + int index = threadId; + LOG(INFO) << "send thread " << threadId << " started"; + int numMyClients = divup(serviceNum_ - index, threadNum_); + while (true) { + SendJobPtr recvJob = sendJobQueue_[index]->dequeue(); + if (stopping_) { + recvJobQueue_[index]->enqueue(recvJob); + break; + } + for (int j = 0; j < numMyClients; ++j) { + REGISTER_TIMER("client_send"); + int i = threadNum_ * j + index; + /// Try to make different clients to send data to different pservers + /// at the same time so that they will not flood data to the same + /// pserver. 
+ i = calcClientId(i, serviceNum_); + if (recvJob->parallelRequests.size()) { + clients_[i].send("sendParameter", + recvJob->parallelRequests[i], + recvJob->parallelInputIovs[i]); + } else { + clients_[i].send("sendData", + recvJob->parallelDataRequests[i], + recvJob->parallelInputIovs[i]); + } + } + recvJobQueue_[index]->enqueue(recvJob); + } +} + +void ParameterClient2::recv(int threadId) { + LOG(INFO) << "recv thread " << threadId << " started"; + int index = threadId; + int numMyClients = divup(serviceNum_ - index, threadNum_); + while (true) { + std::vector bufs; + SendParameterResponse response; + SendDataResponse dataResponse; + SendJobPtr recvJob = recvJobQueue_[index]->dequeue(); + if (stopping_) break; + for (int j = 0; j < numMyClients; ++j) { + REGISTER_TIMER("client_recv"); + int i = threadNum_ * j + index; + i = calcClientId(i, serviceNum_); + if (recvJob->parallelRequests.size()) { + auto msgReader = clients_[i].recv(&response); + CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size()); + bufs.clear(); + bufs.reserve(response.blocks_size()); + for (auto& block : response.blocks()) { + auto it = parameterMap_.find(block.para_id()); + CHECK(it != parameterMap_.end()); + Parameter* parameter = it->second.get(); + real* buf = + parameter->getBuf(PARAMETER_VALUE)->getPoint(block.begin_pos()); + CHECK_EQ(msgReader->getBlockLength(bufs.size()), + sizeof(real) * (block.block_size())); + bufs.push_back(buf); + } + msgReader->readBlocks(bufs); + } else { + auto msgReader = clients_[i].recv(&dataResponse); + CHECK_EQ(msgReader->getNumBlocks(), (size_t)dataResponse.blocks_size()); + size_t totalLen = msgReader->getTotalLength(); + if (0 == totalLen) { + continue; + } + auto& recvMem = recvDataMems_[dataResponse.server_id()]; + CHECK_EQ(dataResponse.blocks_size(), 1) + << "Only one block currently support now!"; + auto& block = dataResponse.blocks(0); + CHECK_EQ(totalLen % sizeof(block.data_size()), 0U); + recvMem = std::make_shared(totalLen); + msgReader->readNextBlock(recvMem.get()->getBuf()); + } + } + recvSyncBarrier_->wait(); + } +} + +void ParameterClient2::waitPassStart() { + WaitPassStartRequest request; + std::vector responses; + multiCall(__func__, request, &responses); +} + +void ParameterClient2::waitPassFinish() { + WaitPassFinishRequest request; + std::vector responses; + multiCall(__func__, request, &responses); +} + +void ParameterClient2::synchronize(SyncObject syncObjectId) { + SynchronizeRequest request; + request.set_sync_object_id(syncObjectId); + std::vector responses; + multiCall(__func__, request, &responses); +} + +void ParameterClient2::asyncFinishPass(SyncObject syncObjectId) { + SynchronizeRequest request; + request.set_sync_object_id(syncObjectId); + request.set_trainer_id(trainerId_); + std::vector responses; + multiCall(__func__, request, &responses); +} + +void ParameterClient2::setConfig(const OptimizationConfig& optConfig, + const std::string& saveDir, + bool isSparseServer) { + SetConfigRequest request; + std::vector responses; + + for (auto& nameAndPara : parameterMap_) { + *request.add_param_configs() = nameAndPara.second->getConfig(); + } + + *request.mutable_opt_config() = optConfig; + request.set_save_dir(saveDir); + request.set_is_sparse_server(isSparseServer); + + std::vector requests; + requests.resize(clients_.size()); + for (size_t i = 0; i < requests.size(); ++i) { + requests[i].CopyFrom(request); + requests[i].set_server_id(i); + } + + responses.resize(clients_.size()); + size_t numClients = clients_.size(); + for (size_t i = 0; 
i < numClients; ++i) { + clients_[i].send(__func__, requests[i]); + } + for (size_t i = 0; i < numClients; ++i) { + clients_[i].recv(&responses[i]); + } +} + +bool ParameterClient2::inStatus(PServerStatus status) { + GetStatusRequest request; + std::vector responses; + + bool ok = true; + multiCall("getStatus", request, &responses); + for (auto& response : responses) { + if (response.status() != status) { + ok = false; + } + } + + return ok; +} + +void ParameterClient2::setStatus(PServerStatus status) { + SetStatusRequest request; + request.set_status(status); + std::vector responses; + multiCall(__func__, request, &responses); +} + +void ParameterClient2::waitForStatus(PServerStatus status) { + while (!inStatus(status)) { + sleep(1); + } +} + +template +static void validateResponses(const std::vector& responses) { + for (auto& response : responses) { + CHECK(response.return_message().empty()) + << "client" << &response - &responses[0] + << " error:" << response.return_message(); + } +} + +PServerVector ParameterClient2::createVector() { + CreateVectorRequest request; + std::vector responses; + int64_t handle = -1; + + multiCall(__func__, request, &responses); + validateResponses(responses); + + for (auto& response : responses) { + if (handle == -1) { + handle = response.handle(); + } else { + CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client" + << &response - &responses[0] << " " + << handle << " " << response.handle(); + } + } + return PServerVector{handle}; +} + +void ParameterClient2::releaseVector(PServerVector handle) { + ReleaseVectorRequest request; + std::vector responses; + + request.set_handle(handle.handle); + multiCall(__func__, request, &responses); + validateResponses(responses); +} + +PServerMatrix ParameterClient2::createMatrix(int32_t numCols) { + CreateMatrixRequest request; + std::vector responses; + int64_t handle = -1; + + request.set_num_cols(numCols); + multiCall(__func__, request, &responses); + validateResponses(responses); + + for (auto& response : responses) { + if (handle == -1) { + handle = response.handle(); + } else { + CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client" + << &response - &responses[0] << " " + << handle << " " << response.handle(); + } + } + return PServerMatrix{handle}; +} + +void ParameterClient2::releaseMatrix(PServerMatrix handle) { + ReleaseMatrixRequest request; + std::vector responses; + + request.set_handle(handle.handle); + multiCall(__func__, request, &responses); + validateResponses(responses); +} + +void PreparedOperations::addOperationHelper(Operation* op, CpuVectorPtr vec) { + ProtoVector& pvec = *op->add_vectors(); + size_t dim = vec->getSize(); + pvec.set_dim(dim); + copyToRepeatedField(pvec.mutable_values(), vec->getData(), vec->getSize()); +} + +void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) { + ProtoMatrix& pmat = *op->add_matrices(); + pmat.set_num_cols(mat->getWidth()); + pmat.set_num_rows(mat->getHeight()); + copyToRepeatedField( + pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows()); +} + +static inline real addTwo(real a, double b) { return a + b; } + +void ParameterClient2::doOperation(PreparedOperations& ops, + bool waitForGradient, + bool sendBackGradient, + bool releasePass) { + std::vector responses; + ops.request_.set_wait_for_gradient(waitForGradient); + ops.request_.set_send_back_parameter(sendBackGradient); + ops.request_.set_release_pass(releasePass); + multiCall(__func__, ops.request_, &responses); + 
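+  // One response arrives per pserver; their partial results are aggregated
+  // below by element-wise summation.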
validateResponses(responses); + size_t numPassFinishServers = 0; + + size_t numOps = ops.request_.operations_size(); + for (auto& response : responses) { + numPassFinishServers += response.pass_finish(); + CHECK_EQ(numOps, (size_t)response.results_size()); + for (size_t opId = 0; opId < numOps; ++opId) { + const OperationResult& result = response.results(opId); + std::vector& resultScalars = ops.localResults_[opId].resultScalars; + std::vector& resultVectors = + ops.localResults_[opId].resultVectors; + std::vector& resultMatrices = + ops.localResults_[opId].resultMatrices; + + if (&response == &responses[0]) { + /// Initialize results to zero + + resultScalars.resize(result.scalars_size()); + for (auto p : resultScalars) { + if (!p) continue; + *p = 0; + } + size_t numVectors = result.vectors_size(); + resultVectors.resize(numVectors); + for (size_t i = 0; i < numVectors; ++i) { + if (!resultVectors[i]) continue; + resultVectors[i]->resize(result.vectors(i).dim()); + resultVectors[i]->zeroMem(); + } + size_t numMatrices = result.matrices_size(); + resultMatrices.resize(numMatrices); + for (size_t i = 0; i < numMatrices; ++i) { + if (!resultMatrices[i]) continue; + resultMatrices[i]->resize(result.matrices(i).num_rows(), + result.matrices(i).num_cols()); + resultMatrices[i]->zeroMem(); + } + } + + // aggregate results from each pserver to results + + CHECK_EQ(resultScalars.size(), (size_t)result.scalars_size()); + for (ssize_t i = 0; i < result.scalars_size(); ++i) { + real* rscalar = resultScalars[i]; + if (!rscalar) continue; + *rscalar += result.scalars(i); + } + + CHECK_EQ(resultVectors.size(), (size_t)result.vectors_size()); + for (auto& vec : result.vectors()) { + int i = &vec - &result.vectors(0); + CpuVectorPtr rvec = resultVectors[i]; + if (!rvec) continue; + CHECK_EQ(rvec->getSize(), (size_t)vec.dim()); + std::transform(rvec->getData(), + rvec->getData() + rvec->getSize(), + vec.values().data(), + rvec->getData(), + addTwo); + } + + CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size()); + for (auto& mat : result.matrices()) { + int i = &mat - &result.matrices(0); + CpuMatrixPtr rmat = resultMatrices[i]; + if (!rmat) continue; + CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows()); + CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols()); + + std::transform(rmat->getData(), + rmat->getData() + rmat->getElementCnt(), + mat.values().data(), + rmat->getData(), + addTwo); + } + } + } + passFinish_ = numPassFinishServers == clients_.size(); +} + +real ParameterClient2::vectorDotProduct(PServerVector u, PServerVector v) { + real result = 0.0; + PreparedOperations ops; + ops.addOperation(PSERVER_OP_utv, u, v)(&result); + doOperation(ops, false, false); + return result; +} + +void ParameterClient2::vectorScale(PServerVector u, real a) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_au, u, a); + doOperation(ops, false, false); +} + +void ParameterClient2::vectorCopy(PServerVector src, PServerVector dst) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_COPY, src, dst); + doOperation(ops, false, false); +} + +void ParameterClient2::vectorAddMult(PServerVector u, PServerVector v, real a) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)1); + doOperation(ops, false, false); +} + +void ParameterClient2::vectorAddMultInto(PServerVector u, + PServerVector v, + PServerVector w, + real a) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_au_bv_cw, v, w, u, (real)1, a, (real)0); + doOperation(ops, false, false); +} + +void 
ParameterClient2::vectorScaleInto(PServerVector u, + PServerVector v, + real a) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)0); + doOperation(ops, false, false); +} + +void ParameterClient2::loadValueVector(const std::string& dirName) { + LoadValueRequest request; + request.set_dir_name(dirName); + std::vector responses; + + multiCall(__func__, request, &responses); + validateResponses(responses); +} + +void ParameterClient2::saveValueVector(const std::string& dirName) { + SaveValueRequest request; + request.set_dir_name(dirName); + std::vector responses; + + multiCall(__func__, request, &responses); + validateResponses(responses); +} + +} // namespace paddle diff --git a/paddle/legacy/pserver/ParameterClient2.h b/paddle/legacy/pserver/ParameterClient2.h new file mode 100644 index 0000000000000000000000000000000000000000..2bc0e478664c0d2f6cf5d38f75bd14e25c1724c6 --- /dev/null +++ b/paddle/legacy/pserver/ParameterClient2.h @@ -0,0 +1,602 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/pserver/BaseClient.h" +#include "paddle/utils/Common.h" +#include "paddle/utils/Flags.h" +#include "paddle/utils/Locks.h" +#include "paddle/utils/Queue.h" +#include "paddle/utils/Util.h" + +#include "ParameterService.pb.h" + +#include "ProtoServer.h" +#include "SparseParameterDistribution.h" + +DECLARE_int32(parallel_thread_num); + +namespace paddle { + +struct PServerMatrix { + int64_t handle; +}; + +struct PServerVector { + int64_t handle; +}; + +/** + * @brief A class to help to prepare server-side operations. + */ +class PreparedOperations { + protected: + class ResultsAdder; + struct LocalOperationResult; + + public: + /** + * Offers an easy way to prepare operations that will be performed on + * server-side. + * + * Usage: + * @code + * addOperation(optype, arguments...)(results...) + * @endcode + * + * Examples: + * 1. set pserver vector to 1: + * @code + * PServerVector u = parameterClient.createVector(); + * addOperation(PSERVER_OP_RESET, u, (real)1); + * @endcode + * + * 2. Compute inner product of to pserver vectors. + * @code + * PServerVector u = parameterClient.createVector(); + * PServerVector v = parameterClient.createVector(); + * real result; + * addOperation(PSERVER_OP_utv, u, v)(&result) + * @endcode + * + * @param[in] operation The operation that pserver will perform. + * @param[in] args Argument list of the operation + * @return A ResultsAdder object initialized with the last element of + * localResults_. + */ + template + ResultsAdder addOperation(MatrixVectorOperation operation, Args... 
+  template <typename... Args>
+  ResultsAdder addOperation(MatrixVectorOperation operation, Args... args) {
+    Operation* op = request_.add_operations();
+    op->set_operation(operation);
+    localResults_.emplace_back();
+    addOperationHelper(op, args...);
+    return ResultsAdder(&localResults_.back());
+  }
+
+ protected:
+  void addOperationHelper(Operation* op) {}
+
+  /**
+   * @brief Helper function to add a new operation that takes a PServerVector
+   *        as an operand.
+   */
+  void addOperationHelper(Operation* op, PServerVector arg) {
+    op->add_pvectors(arg.handle);
+  }
+
+  /**
+   * @brief Helper function to add a new operation that takes a PServerMatrix
+   *        as an operand.
+   */
+  void addOperationHelper(Operation* op, PServerMatrix arg) {
+    op->add_pmatrices(arg.handle);
+  }
+
+  /**
+   * @brief Helper function to add a new operation that takes a real valued
+   *        scalar as an operand.
+   */
+  void addOperationHelper(Operation* op, real arg) { op->add_scalars(arg); }
+
+  /**
+   * @brief Helper function to add a new operation that takes a CpuVectorPtr
+   *        as an operand.
+   * @note The array of CpuVectors that arg points to will be copied to
+   *       op's vectors field.
+   */
+  void addOperationHelper(Operation* op, CpuVectorPtr arg);
+
+  /**
+   * @brief Helper function to add a new operation that takes a CpuMatrixPtr
+   *        as an operand.
+   * @note The array of CpuMatrix objects that arg points to will be copied to
+   *       op's matrices field.
+   */
+  void addOperationHelper(Operation* op, CpuMatrixPtr arg);
+
+  /**
+   * @brief Helper function to add a new operation and prepare the operands.
+   *
+   * @tparam Arg An operand of the operation.
+   * @tparam Args A list of the remaining operands of the operation.
+   * @param op Pointer to an Operation object.
+   */
+  template <typename Arg, typename... Args>
+  void addOperationHelper(Operation* op, Arg arg, Args... args) {
+    addOperationHelper(op, arg);
+    addOperationHelper(op, args...);
+  }
+
+  /**
+   * @brief ResultsAdder offers easy ways to quickly store operation results.
+   */
+  class ResultsAdder {
+   public:
+    explicit ResultsAdder(LocalOperationResult* localResult)
+        : localResult_(localResult) {}
+    template <typename... Args>
+    void operator()(Args... args) {
+      addResult(args...);
+    }
+    void addResult() {}
+    void addResult(real* arg) { localResult_->resultScalars.push_back(arg); }
+    void addResult(CpuVectorPtr arg) {
+      localResult_->resultVectors.push_back(arg);
+    }
+    void addResult(CpuMatrixPtr arg) {
+      localResult_->resultMatrices.push_back(arg);
+    }
+    template <typename Arg, typename... Args>
+    void addResult(Arg arg, Args... args) {
+      addResult(arg);
+      addResult(args...);
+    }
+
+   protected:
+    LocalOperationResult* localResult_;
+  };
+
+ protected:
+  DoOperationRequest request_;
+  std::vector<iovec> inputIovs_;
+  struct LocalOperationResult {
+    std::vector<real*> resultScalars;
+    std::vector<CpuVectorPtr> resultVectors;
+    std::vector<CpuMatrixPtr> resultMatrices;
+  };
+  std::vector<LocalOperationResult> localResults_;
+  friend class ParameterClient2;
+};
+
+struct ParameterSegments {
+  std::string name;  // name of the parameter
+  size_t id;         // id of the parameter
+};
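Taken together, addOperation() and ResultsAdder let a caller queue several server-side operations and bind local destinations for their results before a single round trip. A hedged sketch of the chaining (assumes an initialized ParameterClient2 named `client`):

    PreparedOperations ops;
    real utv = 0;
    PServerVector value = client.getPServerParameterValue();
    PServerVector grad = client.getPServerParameterGradient();
    // Queue two operations in one request: utv = value . grad, then grad *= 0.
    ops.addOperation(PSERVER_OP_utv, value, grad)(&utv);  // bind &utv via ResultsAdder
    ops.addOperation(PSERVER_OP_au, grad, (real)0);       // no local results
    client.doOperation(ops, /*waitForGradient=*/false, /*sendBackParameter=*/false);
    // utv now holds the dot product aggregated across all pservers.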
+/**
+ * The client interface for the parameter server. ParameterClient2 supports 2
+ * modes for managing connections to parameter servers. In the first mode, one
+ * connection is shared by 2 threads that are separately responsible for the
+ * sending and receiving activities; in the second mode, one connection is
+ * owned by only one thread, and all the sending and receiving activities run
+ * in that single thread.
+ * TODO(yanfei):
+ * A further optimization of pserver performance is to run sync-sgd at the
+ * parameter level instead of the pserver level. Parameter-level
+ * parallelization of sync-sgd can also track the layer-by-layer
+ * forward/backward computation of deeper models.
+ * First, the pserver can parallelize all computation at the parameter level
+ * and send a parameter's value back as soon as that parameter is ready,
+ * instead of waiting for all gradients to finish and all parameter values to
+ * become ready.
+ * Second, the parameter client can write parameters back to the GPU as they
+ * arrive, instead of waiting until all parameters have been received on the
+ * CPU host end.
+ */
+class ParameterClient2 : public BaseClient {
+ public:
+  /** Constructor.
+   * @param separate True if sending and receiving activities are separated
+   *                 into 2 threads, otherwise false.
+   * @param port Port number that the parameter client runs on.
+   * @param numPorts Number of ports the parameter client occupies;
+   *                 numPorts * the number of pservers is the total number of
+   *                 connections the parameter client maintains.
+   */
+  ParameterClient2(bool separate = false,
+                   int port = FLAGS_port,
+                   int numPorts = FLAGS_ports_num);
+
+  ~ParameterClient2();
+
+  static int calcParameterBlockSize(const std::vector<ParameterPtr>& parameters,
+                                    size_t serviceNum);
+
+ public:
+  bool init(const std::vector<ParameterPtr>& parameters);
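A hedged bring-up sketch for the client (the parameter list construction is application-specific and elided; the flag values are the defaults declared above):

    std::vector<ParameterPtr> parameters;  // assumed: built by the trainer
    // separate=true gives each connection a dedicated send and recv thread.
    ParameterClient2 client(/*separate=*/true, FLAGS_port, FLAGS_ports_num);
    CHECK(client.init(parameters)) << "could not connect to the pservers";
    client.setTrainerId(0);
    client.getParameter();  // pull the initial values from the pservers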
+  /// service functions
+
+  /**
+   * @brief Sends the segments in parameter to parameter servers, then
+   *        receives the response from the servers.
+   * @param[in] updateMode Indicates how parameters should be updated on the
+   *            server side.
+   * @param[in] parameterType Type of parameter that will be sent.
+   * @param[in] segments Segments in the parameter that will be sent.
+   * @param[in] numSamples Number of samples this update is based on.
+   * @param[in] cost Cost of the batch; it will be used to calculate the
+   *            global objective value.
+   * @param[in] sendBackParameter True if the updated parameters should be
+   *            sent back, otherwise false.
+   * @param[in] sendBackParameterType Parameter type to send back from the
+   *            pserver, PARAMETER_VALUE by default.
+   * @param[in] recvParameterType pserver[sendBackParameterType] will be
+   *            copied to client[recvParameterType].
+   * @note Only parameterType will be sent.
+   */
+  void sendAndReceiveParameter(ParameterUpdateMode updateMode,
+                               ParameterType parameterType,
+                               const std::vector<ParameterSegments>& segments,
+                               int64_t numSamples,
+                               real cost,
+                               bool sendBackParameter,
+                               ParameterType sendBackParameterType,
+                               ParameterType recvParameterType);
+
+  /**
+   * @brief Sends all parameters to parameter servers, and receives the
+   *        response from the servers.
+   */
+  void sendAndReceiveParameter(
+      ParameterUpdateMode updateMode,
+      ParameterType parameterType,
+      int64_t numSamples,
+      real cost,
+      bool sendBackParameter,
+      ParameterType sendBackParameterType = PARAMETER_VALUE,
+      ParameterType recvParameterType = PARAMETER_VALUE) {
+    sendAndReceiveParameter(updateMode,
+                            parameterType,
+                            allSegments_,
+                            numSamples,
+                            cost,
+                            sendBackParameter,
+                            sendBackParameterType,
+                            recvParameterType);
+  }
+
+  /**
+   * @brief Sends the segments in parameter to parameter servers. Each
+   *        sendParameter() must be paired with a recvParameter() in the
+   *        future. Only parameterType will be sent.
+   *
+   * @param[in] updateMode Indicates how parameters should be updated on the
+   *            server side.
+   * @param[in] parameterType Type of parameter that will be sent.
+   * @param[in] segments Segments in the parameter that will be sent.
+   * @param[in] numSamples Number of samples this update is based on.
+   * @param[in] cost Cost of the batch; it will be used to calculate the
+   *            global objective value.
+   * @param[in] sendBackParameter True if the updated parameters should be
+   *            sent back, otherwise false.
+   * @param[in] batchStatus Status of the batch.
+   * @note This function is non-blocking. This means that the parameters must
+   *       not change between this call and recvParameter().
+   */
+  void sendParameter(ParameterUpdateMode updateMode,
+                     ParameterType parameterType,
+                     const std::vector<ParameterSegments>& segments,
+                     int64_t numSamples,
+                     real cost,
+                     bool sendBackParameter,
+                     BatchStatus batchStatus);
+
+  void recvParameter();
+
+  /**
+   * Sends all parameters to parameter servers; recvParameter() has to be
+   * invoked afterwards.
+   *
+   * @note This function is non-blocking. This means that the parameters must
+   *       not change between this call and recvParameter().
+   */
+  void sendParameter(ParameterUpdateMode updateMode,
+                     ParameterType parameterType,
+                     int64_t numSamples,
+                     real cost,
+                     bool sendBackParameter,
+                     BatchStatus batchStatus) {
+    sendParameter(updateMode,
+                  parameterType,
+                  allSegments_,
+                  numSamples,
+                  cost,
+                  sendBackParameter,
+                  batchStatus);
+  }
+
+  /// Get all parameters from the parameter servers
+  void getParameter(ParameterType recvParameterType = PARAMETER_VALUE,
+                    ParameterType sendBackParameterType = PARAMETER_VALUE) {
+    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM,
+                            PARAMETER_VALUE,
+                            0,     // numSamples = 0
+                            0,     // cost = 0
+                            true,  // sendBackParameter = true
+                            sendBackParameterType,
+                            recvParameterType);
+  }
+
+  /// Get parameters by sparse row ids from the parameter servers
+  void getParameterSparse(
+      ParameterType recvParameterType = PARAMETER_VALUE,
+      ParameterType sendBackParameterType = PARAMETER_VALUE) {
+    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM_SPARSE,
+                            PARAMETER_VALUE,
+                            0,     // numSamples = 0
+                            0,     // cost = 0
+                            true,  // sendBackParameter = true
+                            sendBackParameterType,
+                            recvParameterType);
+  }
+
+  /// Set all parameters on the parameter servers using the local parameters
+  void setParameter() {
+    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM,
+                            PARAMETER_VALUE,
+                            0,       // numSamples = 0
+                            0,       // cost = 0
+                            false);  // sendBackParameter = false
+  }
+
+  /**
+   * Set all parameters on the parameter servers to zero; no local parameter
+   * values are sent.
+   */
+  void setParameterZero() {
+    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO,
+                            PARAMETER_VALUE,
+                            0,       // numSamples = 0
+                            0,       // cost = 0
+                            false);  // sendBackParameter = false
+  }
+
+  /**
+   * @brief Wait until all gradient servers start one pass.
+   *
+   * @note This is now only used by the gradient servers for the "sgd"
+   *       algorithm. Calling this function means that the calling gradient
+   *       server is ready to start a new pass.
+   */
+  void waitPassStart();
+
+  /**
+   * @brief Wait until all gradient servers finish one pass.
+   *
+   * @note This is now only used by the gradient servers for the "sgd"
+   *       algorithm. Calling this function means that the calling gradient
+   *       server finishes one pass.
+   */
+  void waitPassFinish();
+
+  /// Wait until all gradient servers call this function.
+  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
+
+  /// Called when async-sgd finishes a pass.
+  void asyncFinishPass(SyncObject syncObjectId = SYNC_DEFAULT);
+
+  void asyncStartPass(SyncObject syncObjectId = SYNC_DEFAULT) {
+    return synchronize(syncObjectId);
+  }
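Because sendParameter() is non-blocking, each call has to be paired with a later recvParameter(), and the parameters involved must stay untouched in between. A hedged sketch of one gradient push that overlaps the round trip with unrelated local work (`batchSize` and `batchCost` are hypothetical locals):

    client.sendParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,
                         PARAMETER_GRADIENT,
                         /*numSamples=*/batchSize,
                         /*cost=*/batchCost,
                         /*sendBackParameter=*/true,
                         BATCH_START_AND_FINISH);
    // ... local work that does not touch the in-flight parameters ...
    client.recvParameter();  // blocks until the updated values arrive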
+  /**
+   * @brief Execute the prepared operations on the pservers, fetch the
+   *        results, and aggregate the results from the different pservers.
+   * @param[in] ops Prepared operations that will be executed on the pservers.
+   * @param[in] waitForGradient If true, wait for the gradient to be ready
+   *            before starting the operations.
+   * @param[in] sendBackParameter If true, send the parameter back to the
+   *            clients after the operations are finished.
+   * @param[in] releasePass If true, and if all clients call waitPassFinish,
+   *            signal all clients that the pass is finished.
+   */
+  void doOperation(PreparedOperations& ops,
+                   bool waitForGradient,
+                   bool sendBackParameter,
+                   bool releasePass = true);
+
+  /**
+   * Set the configuration of the pservers, including the parameter config
+   * and the optimization config.
+   */
+  void setConfig(const OptimizationConfig& optConfig,
+                 const std::string& saveDir = "",
+                 bool isSparseServer = false);
+
+  /// Return true if all pservers are in the given status
+  bool inStatus(PServerStatus status);
+  bool isPassFinish() { return passFinish_; }
+
+  /// Set pserver status
+  void setStatus(PServerStatus status);
+
+  /**
+   * @brief Wait until all pservers are at the given status.
+   * @note This function is not suitable for frequent use; it polls with a
+   *       1-second sleep until the condition is satisfied.
+   */
+  void waitForStatus(PServerStatus status);
+
+  /// Create a column vector. The size is the dimension of the parameter.
+  PServerVector createVector();
+
+  /// Release the PServerVector with the given handle.
+  void releaseVector(PServerVector handle);
+
+  /**
+   * Create a column-major matrix. The number of rows is the dimension of the
+   * parameter. The number of columns is specified by numCols.
+   */
+  PServerMatrix createMatrix(int32_t numCols);
+
+  /// Release the PServerMatrix with the given handle.
+  void releaseMatrix(PServerMatrix handle);
+
+  // Some basic algebra functions
+  /// Calculate the dot product of u and v
+  real vectorDotProduct(PServerVector u, PServerVector v);
+
+  /// Scale u by a
+  void vectorScale(PServerVector u, real a);
+
+  /// Copy from src to dst
+  void vectorCopy(PServerVector src, PServerVector dst);
+
+  /// u += v * a
+  void vectorAddMult(PServerVector u, PServerVector v, real a);
+
+  /// u = v + w * a
+  void vectorAddMultInto(PServerVector u,
+                         PServerVector v,
+                         PServerVector w,
+                         real a);
+
+  /// u = v * a
+  void vectorScaleInto(PServerVector u, PServerVector v, real a);
+
+  /// Return the pserver parameter value.
+  PServerVector getPServerParameterValue() {
+    PServerVector vec;
+    vec.handle = PARAMETER_VALUE;
+    return vec;
+  }
+
+  /// Return the pserver parameter gradient.
+  PServerVector getPServerParameterGradient() {
+    PServerVector vec;
+    vec.handle = PARAMETER_GRADIENT;
+    return vec;
+  }
+
+  /**
+   * Tell the pservers to load the value vector from file.
+   *
+   * @param[in] dirName The directory that contains the value vector file.
+   */
+  void loadValueVector(const std::string& dirName);
+
+  /// Tell the pservers to save the value vector to file.
+ void saveValueVector(const std::string& dirName); + + void setTrainerId(int trainerId) { trainerId_ = trainerId; } + +#ifndef PADDLE_DISABLE_TIMER + void setForwardbackwardTime(uint64_t delta) { forwardbackwordTime_ = delta; } +#endif + + protected: + template + void multiCall(const char* funcName, + const ProtoIn& request, + std::vector* responses) { + responses->resize(clients_.size()); + size_t numClients = clients_.size(); + for (size_t i = 0; i < numClients; ++i) { + clients_[i].send(funcName, request); + } + for (size_t i = 0; i < numClients; ++i) { + clients_[i].recv(&(*responses)[i]); + } + } + + private: + void destroy(); + + /** + * @brief management function for parallelizing send/recv all connections + * to all pservers. it is called under one SyncThreadPool. it + * supports to use N thread to control M connections. the receiving + * actions can be started until all sending action to all connections + * owned by current thread are finished. Different connections + * controlled + * by different threads can transfer data asynchronously. + */ + void sendParallel(int tid, + size_t numThreads, + ParameterType recvParameterType); + /// sending thread routine for asynchronously send data + void send(int threadId); + /// receiving thread routing for asynchronously receive data + void recv(int threadId); + + /** + * @brief main routine to build data for pserver + * + * @note it can prepare different kinds of parameter type data. it can + * be regarded as layer for bridging real parameters data and + * protobuf data for communication. + * TODO(yanfei): + * can abstract additional layer to encode and decode data to/from + * protobuf data. + */ + void prepareSendData( + ParameterUpdateMode updateMode, + ParameterType parameterType, // client send type + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, // send back type in pserver + BatchStatus batchStatus, + SendJob* sendJob); + + /// start necessary threads for threadPool + void initThreads(); + + protected: + /// start port number of pserver + /// it deduce all ports for dense and sparse with some rules + int port_; + /// identify the trainer id using this client + int trainerId_; + +#ifndef PADDLE_DISABLE_TIMER + uint64_t forwardbackwordTime_; +#endif + std::mutex sparseAutoGrowthMutex_; + + /// map id to parameter used for decoding protobuf data + std::unordered_map parameterMap_; + /// segments for all parameters that needed to sync + std::vector allSegments_; + + /// module for sensing sparse parameters distribution on all pservers + std::unique_ptr sparseDistribution_; + + /// thread pool for parallelizing all connections to pservers + std::unique_ptr syncThreadPool_; + + bool passFinish_; +}; + +} // namespace paddle diff --git a/paddle/legacy/pserver/ParameterServer2.cpp b/paddle/legacy/pserver/ParameterServer2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..293fc7ca69bed0fa59bf6972fbe9908967842acf --- /dev/null +++ b/paddle/legacy/pserver/ParameterServer2.cpp @@ -0,0 +1,1401 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParameterServer2.h"
+
+#include <algorithm>
+#include <fstream>
+
+#include "paddle/legacy/math/SIMDFunctions.h"
+#include "paddle/legacy/parameter/AverageOptimizer.h"
+#include "paddle/legacy/parameter/FirstOrderOptimizer.h"
+#include "paddle/legacy/parameter/OptimizerFunctions.h"
+#include "paddle/legacy/parameter/OptimizerWithRegularizer.h"
+#include "paddle/legacy/parameter/ParameterOptimizer.h"
+#include "paddle/legacy/parameter/ParameterUpdateFunctions.h"
+#include "paddle/legacy/parameter/Regularizer.h"
+#include "paddle/legacy/parameter/ThreadLocalBuffer.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/GlobalConstants.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/StringUtil.h"
+
+DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
+DEFINE_double(async_lagged_ratio_min,
+              1.0,
+              "control config_.async_lagged_grad_discard_ratio() min value");
+DEFINE_double(
+    async_lagged_ratio_default,
+    1.5,
+    "if async_lagged_grad_discard_ratio is not set in trainer_config.conf, "
+    "use it as the default value");
+
+namespace paddle {
+
+const std::string ParameterServer2::kRetMsgInvalidMatrixHandle =
+    "Invalid matrix handle";
+const std::string ParameterServer2::kRetMsgInvalidVectorHandle =
+    "Invalid vector handle";
+const std::string ParameterServer2::kRetMsgUnknownOperation =
+    "Unknown operation";
+
+ParameterServer2::ParameterServer2(const std::string& addr,
+                                   int port,
+                                   int rdmaCpu)
+    : ProtoServer(addr, port, rdmaCpu),
+      dataSize_(0),
+      size_(0),
+      gradientReadyBarrier_(FLAGS_num_gradient_servers + 1),
+      parameterReadyBarrier_(FLAGS_num_gradient_servers + 1),
+      passBarrier_(FLAGS_num_gradient_servers + 1),
+      numPassFinishClients_(0),
+      allClientPassFinish_(false),
+      serverId_(-1),
+      batchId_(-1) {
+  /**
+   * Register the functions that remote clients may call. These functions are
+   * mapped into a data structure for quick lookup; each request from a
+   * trainer carries a function name that indicates the remote action. This
+   * architecture gives the pserver an RPC-style interface.
+   */
+  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendParameter);
+  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendData);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, setConfig);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, setStatus);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, getStatus);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, doOperation);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, createVector);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseVector);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, createMatrix);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseMatrix);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassStart);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassFinish);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, synchronize);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, asyncFinishPass);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, loadValueVector);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, saveValueVector);
+
+  /// thread pool for parallelizing some computations
+  if (FLAGS_pserver_num_threads > 1) {
+    syncThreadPool_.reset(new SyncThreadPool(FLAGS_pserver_num_threads, false));
+  }
+}
+
+bool ParameterServer2::init() {
+  vectors_.resize(NUM_PARAMETER_TYPES);
+  configMap_.clear();
+
+  numSamplesProcessed_ = 0;
+  cost_ = 0;
+  char* mpienv = getenv("OMPI_COMM_WORLD_SIZE");
+  if (mpienv != NULL) {
+    mpiSize_ = atoi(mpienv);
+  } else {
+    mpiSize_ = 1;
+  }
+  status_ = PSERVER_STATUS_NOT_SET;
+  dataMems_.resize(FLAGS_num_gradient_servers);
+  synchronizeBarriers_.resize(SyncObject_ARRAYSIZE);
+  for (auto& barrier : synchronizeBarriers_) {
+    barrier.reset(new ThreadBarrier(FLAGS_num_gradient_servers));
+  }
+
+  // initialization for discarding lagged gradients
+  asyncUpdateSteps_ = 0;
+  asyncTrainerSteps_.resize(FLAGS_num_gradient_servers);
+  asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0);
+  asyncLaggedGradientsNum_ = 0;
+  asyncUpdateStat_.resize(static_cast<int64_t>(
+      FLAGS_num_gradient_servers * FLAGS_async_lagged_ratio_default));
+  asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0);
+  asyncTrainerDiscardStat_.resize(FLAGS_num_gradient_servers);
+  asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0);
+  asyncTrainerCommitStat_.resize(FLAGS_num_gradient_servers);
+  asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0);
+
+  return true;
+}
+
+void ParameterServer2::getStatus(const GetStatusRequest& request,
+                                 ProtoResponseCallback callback) {
+  (void)request;
+  GetStatusResponse response;
+  response.set_status(status_);
+  callback(response);
+}
+
+void ParameterServer2::setStatus(const SetStatusRequest& request,
+                                 ProtoResponseCallback callback) {
+  status_ = request.status();
+  SetStatusResponse response;
+  callback(response);
+}
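For intuition about the lagged-gradient cutoff configured in setConfig() below: the threshold is simply the discard ratio scaled by the number of gradient servers. Illustrative arithmetic (example numbers, not shipped defaults):

    // With 4 gradient servers and async_lagged_grad_discard_ratio = 1.5:
    //   asyncLaggedThreshold_ = static_cast<int64_t>(4 * 1.5) = 6
    // so an async gradient whose trainer is more than 6 global update steps
    // behind asyncUpdateSteps_ is discarded rather than committed.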
{ + LOG(INFO) << "WARNING: async_lagged_grad_discard_ratio is too small" + << "reset to default, async_lagged_grad_discard_ratio = " + << FLAGS_async_lagged_ratio_default; + asyncLaggedRatio = FLAGS_async_lagged_ratio_default; + } + asyncLaggedThreshold_ = + static_cast(FLAGS_num_gradient_servers * asyncLaggedRatio); + LOG(INFO) << "discard lagged async gradient ratio: " << asyncLaggedRatio + << " asyncLaggedhreshold: " << asyncLaggedThreshold_; + } + if (isSparseServer_ && config_.num_batches_per_send_parameter() > 1) { + /// sparse server must NOT use local update mode + config_.set_num_batches_per_send_parameter(1); + } + + if (config_.num_batches_per_send_parameter() > 1 && + config_.center_parameter_update_method() == "average") { + /// scaling L1/L2 decay rate as large as L1/L2 apply in trainer + /// if parameter regularization in pserver + for (auto& pair : configMap_) { + ParameterConfig& config = pair.second; + if (config_.num_batches_per_send_parameter() == + config.num_batches_regularization()) { + real scale = + config_.delta_add_rate() * config.num_batches_regularization(); + if (config_.algorithm() == "sgd") { + scale *= FLAGS_num_gradient_servers; + } + config.set_decay_rate(config.decay_rate() * scale); + if (config.decay_rate() > 0.1f) { + LOG(FATAL) << "L2 decay=" << config.decay_rate() + << " for parameter:" << config.name() + << " is too large after scale in pserver!"; + } + config.set_decay_rate_l1(config.decay_rate_l1() * scale); + if (config.decay_rate_l1() > 0.1f) { + LOG(FATAL) << "L1 decay=" << config.decay_rate_l1() + << " for parameter:" << config.name() + << " is too large after scale in pserver!"; + } + + LOG(INFO) << "parameter:" << config.name() + << " decay apply in pserver," + << " L1 decay=" << config.decay_rate_l1() + << " L2 decay=" << config.decay_rate(); + } + } + } + } + + SetConfigResponse response; + callback(response); +} + +real bufferSum(const std::vector& buffers) { + real sum = 0; + for (const auto buffer : buffers) { + for (size_t i = 0; i < buffer.size; ++i) { + sum += buffer.base[i]; + } + } + return sum; +} + +void ParameterServer2::mergeSegments(BlockSegments* segments) { + if (segments->empty()) { + return; + } + std::sort(segments->begin(), segments->end()); + auto curr = segments->begin(); + for (auto it = segments->begin(); it != segments->end(); ++it) { + if (it->first <= curr->second) { + curr->second = std::max(curr->second, it->second); + } else { + ++curr; + *curr = *it; + } + } + ++curr; + segments->erase(curr, segments->end()); +} + +void ParameterServer2::setParameter(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + (void)response; + (void)outputBuffers; + LOG(INFO) << "pserver: setParameter"; + std::lock_guard guard(parameterMutex_); + + int64_t numBlocks = blockIdMap_.size(); + CHECK_EQ(blockIdMap_.size(), blockOffsetMap_.size()); + /// total bytes for all the added blocks + int64_t totalSize = size_; + std::vector offsets; + offsets.reserve(request.blocks_size()); + std::vector blockIds; + blockIds.reserve(request.blocks_size()); + int bufferIndex = 0; + + if (!request.blocks().size()) { + LOG(WARNING) + << "--ports_num or --ports_num_for_sparse might be too large, " + << "or total dense parameter size or sparse parameters size " + << "might be too small, this psever doesn't store any parameter."; + return; + } + + for (const auto& block : request.blocks()) { + /// block size for parameter(e.g. 
128 for sparse row, 1K for dense) + uint64_t blockSize = getParameterConfig(block).parameter_block_size(); + BlockKey key(block.para_id(), block.block_id()); + if (inputBuffers.size()) { // if !=PSERVER_UPDATE_MODE_SET_PARAM_ZERO + Buffer buffer = inputBuffers[bufferIndex]; + ++bufferIndex; + CHECK_EQ(buffer.size, block.block_size()) + << "data size is too big:" + << " block_size=" << block.block_size() + << " data_size=" << buffer.size; + } + + /// add a new block + if (blockIdMap_.count(key) == 0) { + blockOffsetMap_[key] = totalSize; + blockIdMap_[key] = numBlocks; + ++numBlocks; + totalSize += blockSize; + } + offsets.push_back(blockOffsetMap_[key]); + blockIds.push_back(blockIdMap_[key]); + } + + size_ = totalSize; + LOG(INFO) << "pserver: new cpuvector: size=" << size_; + if (!vectors_[PARAMETER_VALUE]) { + /// vectors_ + const auto types = sgdOptimizerGetTypes(config_, true /*inPserver*/); + for (const auto type : types) { + vectors_[type].reset(new CpuVector(size_)); + vectors_[type]->zeroMem(); + } + + blockInfos_.resize(numBlocks); + for (auto& info : blockInfos_) { + info.lock.reset(new std::mutex()); + } + } else { + CHECK_EQ((size_t)size_, vectors_[PARAMETER_VALUE]->getSize()) + << "Currently adding new blocks is not supported. " + << "All blocks must be added in one setParameter call"; + } + + VectorPtr buf = vectors_[PARAMETER_VALUE]; + usedSegments_.reserve(offsets.size()); + /// if offsets is empty, means parameter_block_size is too big or too many + /// nodes. + if (offsets.empty()) { + LOG(WARNING) << "in setParameter: offsets is empty"; + } + for (size_t i = 0; i < offsets.size(); ++i) { + size_t blockId = blockIds[i]; + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(request.blocks(i)); + info.config = &config; + info.offset = offsets[i]; + info.optimizer.reset(sgdOptimizerCreate( + config_, config, config.sparse_remote_update(), true /*inPserver*/)); + if (config.sparse_remote_update()) { + size_t width = config.dims(1); + CHECK_EQ(config.parameter_block_size(), width) + << "block size: " << config.parameter_block_size() + << "width : " << width; + } + info.optimizer->init(1, info.config); + usedSegments_.push_back(std::make_pair( + offsets[i], offsets[i] + request.blocks(i).block_size())); + } + mergeSegments(&usedSegments_); + + if (request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM) { + /// copy param from trainer + for (size_t i = 0; i < offsets.size(); ++i) { + Buffer buffer = inputBuffers[i]; + real* start = buf->getPoint(offsets[i]); + CHECK_LE(offsets[i] + buffer.size, buf->getSize()); + memcpy(start, buffer.base, sizeof(real) * buffer.size); + } + } else { + CHECK(request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM_ZERO); + /// nothing to do, value vector zero mem already + } +} + +void ParameterServer2::addGradient(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + VLOG(1) << "pserver: addGradient"; + + { + ReadLockGuard guard(parameterMutex_); + int bufferIndex = 0; + for (const auto& block : request.blocks()) { + int64_t offset = getBlockOffset(block); + CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + + int64_t blockId = getBlockId(block); + CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + + Buffer buffer = inputBuffers[bufferIndex]; + 
++bufferIndex; + + const real* gradientBuffer = buffer.base; + real* gradientSumBuffer = vectors_[PARAMETER_GRADIENT]->getPoint(offset); + + size_t size = buffer.size; + + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + if (config.sparse_remote_update()) { + CHECK_EQ(size, config.parameter_block_size()); + } else { // dense + CHECK_LE(size, config.parameter_block_size()); + } + std::lock_guard guard(*info.lock); + simd::addTo(gradientSumBuffer, gradientBuffer, size); + } + } + if (request.batch_status() == BATCH_FINISH || + request.batch_status() == BATCH_START_AND_FINISH) { + numSamplesProcessed_ += request.num_samples(); + cost_ += request.cost(); + VLOG(1) << "num samples: " << numSamplesProcessed_ + << ", new cost:" << cost_; + + /// notify doOperation gradient ready + gradientReadyBarrier_.wait(); + + /// wait doOperation finish + parameterReadyBarrier_.wait(); + VLOG(1) << "start send back"; + } +} + +bool ParameterServer2::asyncGrdientCommitCheckAndStat( + const SendParameterRequest& request) { + const auto trainerId = request.trainer_id(); + int64_t trainerSteps = asyncTrainerSteps_[trainerId]; + CHECK_GE(asyncUpdateSteps_, trainerSteps) + << " async update steps overflows " + << " trainer id: " << trainerId + << " async update steps in pserver: " << asyncUpdateSteps_ + << " async update steps in request: " << trainerSteps; + + asyncUpdateSteps_++; + bool commitGradient = true; + + int64_t delta = asyncUpdateSteps_ - trainerSteps; + if (delta >= asyncLaggedThreshold_) { + VLOG(1) << "discard Async Update: " + << " trainer id: " << trainerId + << " pserver steps: " << asyncUpdateSteps_ + << " request steps: " << trainerSteps; + asyncLaggedGradientsNum_++; + commitGradient = false; + } + /// stat on lagged steps, to get total discard distribution + if (static_cast(delta) < asyncUpdateStat_.size()) { + asyncUpdateStat_[delta]++; + } else { + asyncUpdateStat_[asyncUpdateStat_.size() - 1]++; + } + /// stat on trainerId and discard, to get trainer condition + if (commitGradient) { + asyncTrainerCommitStat_[trainerId]++; + } else { + asyncTrainerDiscardStat_[trainerId]++; + } + + return commitGradient; +} + +static ThreadLocal> localBlockBitset_; + +void ParameterServer2::asyncSGD(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + int64_t numBlocks = blockIdMap_.size(); + auto& localBlockBitset = *localBlockBitset_; + + if (isSparseServer_) { + if (localBlockBitset.empty()) { + localBlockBitset.resize(numBlocks); + } + localBlockBitset.assign(numBlocks, false); + } + + ReadLockGuard guard(parameterMutex_); + + if (request.send_back_parameter()) { + outputBuffers->reserve(request.blocks_size()); + } + + bool commitGradient = asyncGrdientCommitCheckAndStat(request); + + VectorPtr* vecs = parameter::getThreadLocalBuffer(); + size_t bufferIndex = 0; + for (const auto& block : request.blocks()) { + int64_t offset = getBlockOffset(block); + CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + int64_t blockId = getBlockId(block); + CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + Buffer buffer = inputBuffers[bufferIndex]; + ++bufferIndex; + + size_t size = buffer.size; + + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + + 
std::lock_guard guard(*info.lock); + /// gradients are too obsolete, will be discarded + if (commitGradient) { + info.optimizer->startBatch(numSamplesProcessed_); + + for (const auto type : info.optimizer->getParameterTypes()) { + vecs[type]->subVecFrom(*vectors_[type], offset, size); + } + vecs[PARAMETER_GRADIENT]->subVecFrom(buffer.base, 0, size); + info.optimizer->update(vecs, config, isSparseServer_ ? 0 : -1); + + if (auto callback = info.optimizer->needSpecialTraversal(config)) { + blockTraverse(info, config, offset, size, vecs, callback); + } + info.optimizer->finishBatch(); + } + + if (commitGradient && isSparseServer_) { + localBlockBitset[blockId] = true; + } + + if (!isSparseServer_ && request.send_back_parameter()) { // dense + int type = request.send_back_parameter_type(); + sendBackParameter(block, type, response, &buffer, outputBuffers); + } + } /// foreach block + + asyncTrainerSteps_[request.trainer_id()] = asyncUpdateSteps_; + + if (commitGradient && isSparseServer_) { + /// find blocks that trainer do not request update + for (int64_t blockId = 0; blockId < numBlocks; ++blockId) { + if (localBlockBitset[blockId]) { + continue; + } + + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = *info.config; + size_t size = config.parameter_block_size(); + + std::lock_guard guard(*info.lock); + info.optimizer->startBatch(numSamplesProcessed_); + if (auto callback = info.optimizer->needSpecialTraversal(config)) { + blockTraverse(info, config, info.offset, size, vecs, callback); + } + info.optimizer->finishBatch(); + } + } + + if (commitGradient && (request.batch_status() == BATCH_FINISH || + request.batch_status() == BATCH_START_AND_FINISH)) { + numSamplesProcessed_ += request.num_samples(); + } + + /// show some performance log if needed + if (request.trainer_id() == 0) { + /// batchId_ is approximately equal to "real batchId_" + batchId_++; + } +} + +void ParameterServer2::getParameter(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + (void)inputBuffers; + LOG(INFO) << "pserver: getParameter"; + ReadLockGuard guard(parameterMutex_); + for (const auto& block : request.blocks()) { + int type = request.send_back_parameter_type(); + sendBackParameter(block, type, response, outputBuffers); + } +} + +void ParameterServer2::getParameterSparse(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + (void)inputBuffers; + auto& buffer = *readWriteBuffer_; + size_t numReals = 0; + for (const auto& block : request.blocks()) { + numReals += getParameterConfig(block).dims(1); + } + buffer.resize(numReals); + + VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals; + + ReadLockGuard guard(parameterMutex_); + size_t offset = 0; + for (const auto& block : request.blocks()) { + size_t width = getParameterConfig(block).dims(1); + Buffer buf = {buffer.data() + offset, width}; + int type = request.send_back_parameter_type(); + sendBackParameterSparse(block, type, response, &buf, width, outputBuffers); + offset += width; + } +} + +void ParameterServer2::sendBackParameter(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + std::vector* outputBuffers) { + ParameterBlock* returnBlock = response->add_blocks(); + returnBlock->set_para_id(block.para_id()); + returnBlock->set_block_id(block.block_id()); + returnBlock->set_begin_pos(block.begin_pos()); + 
returnBlock->set_block_size(block.block_size()); + + int64_t offset = getBlockOffset(block); + CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + + real* valueBuffer = vectors_[parameterType]->getPoint(offset); + outputBuffers->push_back({valueBuffer, (size_t)block.block_size()}); +} + +void ParameterServer2::sendBackParameter(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, + std::vector* outputBuffers) { + ParameterBlock* returnBlock = response->add_blocks(); + returnBlock->set_para_id(block.para_id()); + returnBlock->set_block_id(block.block_id()); + returnBlock->set_begin_pos(block.begin_pos()); + returnBlock->set_block_size(block.block_size()); + + int64_t offset = getBlockOffset(block); + CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + + size_t size = buffer->size; + real* valueBuffer = vectors_[parameterType]->getPoint(offset); + /// copy to second buffer to avoid to be polluted by other request + memcpy(buffer->base, valueBuffer, sizeof(real) * size); + outputBuffers->push_back({buffer->base, size}); +} + +void ParameterServer2::sendBackParameterSparse( + const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, + size_t width, + std::vector* outputBuffers) { + ParameterBlock* returnBlock = response->add_blocks(); + returnBlock->set_para_id(block.para_id()); + returnBlock->set_block_id(block.block_id()); + returnBlock->set_begin_pos(block.begin_pos()); + returnBlock->set_block_size(block.block_size()); + int64_t offset = getBlockOffset(block); + CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + + real* valueBuffer = vectors_[parameterType]->getPoint(offset); + CHECK_EQ(buffer->size, width); + memcpy(buffer->base, valueBuffer, width * sizeof(real)); + outputBuffers->push_back(*buffer); +} + +void ParameterServer2::readAllBlocks( + MsgReader* msgReader, std::vector* buffers) { + auto& buffer = *readWriteBuffer_; + size_t numBlocks = msgReader->getNumBlocks(); + buffer.resizeWithAlignHints(msgReader->getTotalLength() / sizeof(real), + numBlocks); + std::vector bufs(numBlocks); + buffers->clear(); + buffers->reserve(numBlocks); + buffer.resetAlignAlloc(); + for (size_t i = 0; i < numBlocks; ++i) { + size_t len = msgReader->getBlockLength(i); + CHECK_EQ(len % sizeof(real), (size_t)0); + size_t size = len / sizeof(real); + bufs[i] = buffer.nextBlock(size); + buffers->push_back({(real*)bufs[i], size}); + } + msgReader->readBlocks(bufs); +} + +void ParameterServer2::sendParameter(const SendParameterRequest& request, + std::unique_ptr msgReader, + ProtoResponseCallbackEx callback) { + SendParameterResponse response; + std::vector inputBuffers; + std::vector outputBuffers; + readAllBlocks(msgReader.get(), &inputBuffers); + msgReader.reset(); + + switch (request.update_mode()) { + case PSERVER_UPDATE_MODE_SET_PARAM: + case PSERVER_UPDATE_MODE_SET_PARAM_ZERO: + setParameter(request, inputBuffers, &response, &outputBuffers); + break; + case PSERVER_UPDATE_MODE_GET_PARAM: + getParameter(request, inputBuffers, &response, &outputBuffers); + break; + case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE: + getParameterSparse(request, inputBuffers, &response, &outputBuffers); + break; + case PSERVER_UPDATE_MODE_ASYNC_SGD: + asyncSGD(request, 
inputBuffers, &response, &outputBuffers); + break; + case PSERVER_UPDATE_MODE_ADD_GRADIENT: + addGradient(request, inputBuffers, &response, &outputBuffers); + break; + case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER: + break; + } + switch (request.update_mode()) { + case PSERVER_UPDATE_MODE_ADD_GRADIENT: + (*requestVec_).push_back(request); + (*callbackVec_).push_back(callback); + if (request.batch_status() == BATCH_FINISH || + request.batch_status() == BATCH_START_AND_FINISH) { + for (size_t i = 0; i < (*requestVec_).size(); i++) { + ReadLockGuard guard(parameterMutex_); + SendParameterRequest& request = (*requestVec_)[i]; + SendParameterResponse responseTemp; + + std::vector outputIovs; + if (request.send_back_parameter()) { + CHECK(!isSparseServer_); + std::vector outputBuffersTemp; + for (const auto& block : request.blocks()) { + int type = request.send_back_parameter_type(); + sendBackParameter(block, type, &responseTemp, &outputBuffersTemp); + } + outputIovs.reserve(outputBuffersTemp.size()); + for (auto buffer : outputBuffersTemp) { + outputIovs.push_back({buffer.base, buffer.size * sizeof(real)}); + } + } + + ProtoResponseCallbackEx& callbackTemp = (*callbackVec_)[i]; + callbackTemp(responseTemp, outputIovs); + } + (*requestVec_).clear(); + (*callbackVec_).clear(); + } + break; + case PSERVER_UPDATE_MODE_SET_PARAM: + case PSERVER_UPDATE_MODE_SET_PARAM_ZERO: + case PSERVER_UPDATE_MODE_GET_PARAM: + case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE: + case PSERVER_UPDATE_MODE_ASYNC_SGD: + case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER: + std::vector outputIovs; + outputIovs.reserve(outputBuffers.size()); + for (auto buffer : outputBuffers) { + outputIovs.push_back({buffer.base, buffer.size * sizeof(real)}); + } + callback(response, outputIovs); + break; + } +} + +template +void ParameterServer2::reduceAndSendData(const SendDataRequest& request, + std::unique_ptr& msgReader, + ProtoResponseCallbackEx& callback) { + SendDataResponse response; + response.set_type(request.type()); + response.set_server_id(serverId_); + + auto sendData = reinterpret_cast(dataMems_[0].get()->getBuf()); + size_t rawMemSize = dataMems_[0].get()->getSize(); + CHECK_EQ(rawMemSize % sizeof(Dtype), 0U); + size_t dataMemSize = rawMemSize / sizeof(Dtype); + for (size_t i = 1; i < dataMems_.size(); ++i) { + CHECK_EQ(dataMems_[i].get()->getSize(), rawMemSize); + auto data = reinterpret_cast(dataMems_[i].get()->getBuf()); + for (size_t j = 0; j < dataMemSize; ++j) { + sendData[j] += data[j]; + } + } + std::vector outputIovs; + auto block = response.add_blocks(); + outputIovs.push_back({sendData, rawMemSize}); + block->set_total_size(rawMemSize); + block->set_data_size(sizeof(Dtype)); + callback(response, outputIovs); +} + +void ParameterServer2::templateReduceSum(const SendDataRequest& request, + std::unique_ptr& msgReader, + ProtoResponseCallbackEx& callback) { + const auto& block = request.blocks(0); + switch (block.data_type()) { + case TRANS_FLOAT: + reduceAndSendData(request, msgReader, callback); + break; + case TRANS_DOUBLE: + reduceAndSendData(request, msgReader, callback); + break; + case TRANS_INT32: + reduceAndSendData(request, msgReader, callback); + break; + case TRANS_UINT32_T: + reduceAndSendData(request, msgReader, callback); + break; + case TRANS_INT64_T: + reduceAndSendData(request, msgReader, callback); + break; + case TRANS_UINT64_T: + reduceAndSendData(request, msgReader, callback); + break; + default: + LOG(FATAL) << "not supported"; + break; + } +} + +void ParameterServer2::sendData(const SendDataRequest& 
request, + std::unique_ptr msgReader, + ProtoResponseCallbackEx callback) { + SendDataResponse response; + response.set_type(request.type()); + response.set_server_id(serverId_); + + switch (request.update_mode()) { + case DATA_UPDATE_MODE_SET_OWN: { + CHECK_EQ(msgReader->getNumBlocks(), (size_t)(request.blocks_size())); + size_t totalLen = msgReader->getTotalLength(); + if (totalLen > 0) { + CHECK_EQ(msgReader->getNumBlocks(), 1U) + << "Only one block currently support now!"; + const auto& block = request.blocks(0); + if (0 == dataSize_) { + dataSize_ = block.data_size(); + } else { + CHECK_EQ(dataSize_, block.data_size()); + } + int64_t serverId = request.server_id(); + if (serverId_ < 0) { + serverId_ = serverId; + } else { + CHECK_EQ(serverId_, serverId); + } + int64_t clientId = request.client_id(); + dataMems_[clientId] = std::make_shared(totalLen); + CHECK_EQ(totalLen % sizeof(block.data_size()), 0U); + msgReader->readNextBlock(dataMems_[clientId].get()->getBuf()); + } + msgReader.reset(); + std::vector outputIovs; + callback(response, outputIovs); + break; + } + case DATA_UPDATE_MODE_GET_ALL: { + /// Currently only support DATA_REDUCE_SUM + /// And their Operations are just add + CHECK(DATA_REDUCE_SUM == request.type()); + templateReduceSum(request, msgReader, callback); + break; + } + default: { LOG(FATAL) << "not supported"; } + } +} + +void ParameterServer2::clearUnusedSegments(CpuVector* vec) { + real* data = vec->getData(); + if (usedSegments_.empty()) { + return; + } + memset(data, 0, sizeof(real) * usedSegments_[0].first); + memset(data + usedSegments_.back().second, + 0, + sizeof(real) * (size_ - usedSegments_.back().second)); + size_t n = size_ - usedSegments_.back().second; + + for (size_t i = 1; i < usedSegments_.size(); ++i) { + memset( + data + usedSegments_[i - 1].second, + 0, + sizeof(real) * (usedSegments_[i].first - usedSegments_[i - 1].second)); + n += usedSegments_[i].first - usedSegments_[i - 1].second; + } +} + +void ParameterServer2::parallelExecForEachBlock(ExecFunc func) { + SyncThreadPool::execHelper( + syncThreadPool_.get(), [&](int tid, size_t numThreads) { + int64_t numBlocks = blockIdMap_.size(); + VectorPtr* vecs = parameter::getThreadLocalBuffer(); + for (int64_t blockId = tid; blockId < numBlocks; + blockId += numThreads) { + func(blockId, vecs); + } + }); +} + +void ParameterServer2::blockTraverse( + BlockInfo& info, + const ParameterConfig& config, + int64_t offset, + size_t size, + const VectorPtr vecs[], + const ParameterOptimizer::TraverseCallback& callback) { + /// setup sub bufs + for (const auto type : info.optimizer->getParameterTypes()) { + vecs[type]->subVecFrom(*vectors_[type], offset, size); + } + callback(vecs, config, config.sparse_remote_update() ? 0 : -1LU); +} + +void ParameterServer2::op_SGD(const Operation& operation, + OperationResult* result) { + (void)operation; + (void)result; + + if (allClientPassFinish_) { + /// when all clients signal pass finished, the update + /// is empty. + return; + } + + { + parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + int64_t offset = info.offset; + size_t size = config.parameter_block_size(); + + info.optimizer->startBatch(numSamplesProcessed_); + + for (const auto type : info.optimizer->getParameterTypes()) { + vecs[type]->subVecFrom(*vectors_[type], offset, size); + } + info.optimizer->update( + vecs, config, config.sparse_remote_update() ? 
0 : -1LU); + vecs[PARAMETER_GRADIENT]->zeroMem(); + + if (auto callback = info.optimizer->needSpecialTraversal(config)) { + blockTraverse(info, config, offset, size, vecs, callback); + } + info.optimizer->finishBatch(); + }); + } + + batchId_++; +} + +void ParameterServer2::op_start_pass(const Operation& operation, + OperationResult* result) { + (void)operation; + (void)result; + + parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { + BlockInfo& info = blockInfos_[blockId]; + info.optimizer->startPass(); + }); +} + +void ParameterServer2::op_finish_pass(const Operation& operation, + OperationResult* result) { + (void)operation; + (void)result; + + parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + size_t size = config.parameter_block_size(); + + /// catch up with + if (auto callback = info.optimizer->startCatchUpWith()) { + blockTraverse(info, config, info.offset, size, vecs, callback); + info.optimizer->finishCatchUpWith(); + } + + /// finish pass + info.optimizer->finishPass(); + }); + batchId_ = 0; +} + +void ParameterServer2::op_apply(const Operation& operation, + OperationResult* result) { + (void)operation; + (void)result; + + parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + int64_t offset = info.offset; + size_t size = config.parameter_block_size(); + + // catch up with + if (auto callback = info.optimizer->startCatchUpWith()) { + blockTraverse(info, config, offset, size, vecs, callback); + info.optimizer->finishCatchUpWith(); + } + + // apply to PARAMETER_APPLY + if (auto callback = info.optimizer->apply()) { + blockTraverse(info, config, offset, size, vecs, callback); + } + }); +} + +void ParameterServer2::op_randomize(const Operation& operation, + OperationResult* result) { + LOG(INFO) << "ParameterServer2::op_randomize: serverId=" << serverId_; + + CpuVector& valueVec = *vectors_[PARAMETER_VALUE]; + + parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + size_t size = config.parameter_block_size(); + + vecs[PARAMETER_VALUE]->subVecFrom(valueVec, info.offset, size); + Parameter::randomize(vecs[PARAMETER_VALUE], config); + }); +} + +void ParameterServer2::loadValueVector(const LoadValueRequest& request, + ProtoResponseCallback callback) { + LoadValueResponse response; + LOG(INFO) << "ParameterServer2::loadValueVector: serverId=" << serverId_; + + constexpr int kBufLen = 100; + char buf[kBufLen]; + snprintf(buf, kBufLen, "/pserver.%04d", static_cast(serverId_)); + std::string filename = request.dir_name() + buf; + + std::ifstream fs(filename, std::ios_base::binary); + CHECK(fs) << "Fail to open " << filename; + + CpuVector& vec = *vectors_[PARAMETER_VALUE]; + Parameter::Header header; + CHECK(fs.read(reinterpret_cast(&header), sizeof(header))) + << "Fail to read parameters in pserver"; + CHECK(Parameter::isHeaderFormatSupported(header.format)) + << "Incorrect format version: " << header.format; + CHECK_EQ(header.size, (size_t)size_) + << "The size (" << header.size << ") in the file does not match the size " + << "(" << size_ << ") of the pserver: " << serverId_; + CHECK_EQ(header.valueSize, sizeof(real)) << "Unsupported valueSize " + << header.valueSize; + 
CHECK(fs.read(reinterpret_cast(vec.getData()), + header.size * sizeof(real))); + + callback(response); +} + +void ParameterServer2::saveValueVector(const SaveValueRequest& request, + ProtoResponseCallback callback) { + SaveValueResponse response; + LOG(INFO) << "ParameterServer2::SaveValueVector: serverId=" << serverId_; + + mkDir(request.dir_name().c_str()); + + constexpr int kBufLen = 100; + char buf[kBufLen]; + snprintf(buf, kBufLen, "/pserver.%04d", static_cast(serverId_)); + std::string filename = request.dir_name() + buf; + + std::ofstream fs(filename, std::ios_base::binary); + CHECK(fs) << "Fail to open " << filename; + + CpuVector& vec = vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY] + : *vectors_[PARAMETER_VALUE]; + Parameter::Header header; + // TODO(TJ): save param headerFormat_ + header.format = PARAM_FORMAT_ORIGINAL; + header.valueSize = sizeof(real); + header.size = size_; + + CHECK_EQ(header.size, vec.getSize()); + + CHECK(fs.write(reinterpret_cast(&header), sizeof(header))) + << "Fail to write parameter in pserver: " << serverId_; + + CHECK(fs.write(reinterpret_cast(vec.getData()), + header.size * sizeof(real))) + << "Fail to write parameter in pserver: " << serverId_; + + callback(response); +} + +void ParameterServer2::op_RESET(const Operation& operation, + OperationResult* result) { + (void)result; + CpuVector* u = vectors_[operation.pvectors(0)].get(); + u->reset(operation.scalars(0)); + clearUnusedSegments(u); +} + +void ParameterServer2::op_utv(const Operation& operation, + OperationResult* result) { + real* u = vectors_[operation.pvectors(0)]->getData(); + real* v = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + double sum = 0; + for (int64_t i = 0; i < size; ++i) { + sum += (double)u[i] * (double)v[i]; + } + result->add_scalars(sum); +} + +void ParameterServer2::op_au_bv(const Operation& operation, + OperationResult* result) { + (void)result; + real* u = vectors_[operation.pvectors(0)]->getData(); + real* v = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + real a = operation.scalars(0); + real b = operation.scalars(1); + for (int64_t i = 0; i < size; ++i) { + v[i] = a * u[i] + b * v[i]; + } +} + +void ParameterServer2::op_COPY(const Operation& operation, + OperationResult* result) { + (void)result; + real* u = vectors_[operation.pvectors(0)]->getData(); + real* v = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + for (int64_t i = 0; i < size; ++i) { + v[i] = u[i]; + } +} + +void ParameterServer2::op_au(const Operation& operation, + OperationResult* result) { + (void)result; + real* u = vectors_[operation.pvectors(0)]->getData(); + int64_t size = size_; + real a = operation.scalars(0); + for (int64_t i = 0; i < size; ++i) { + u[i] *= a; + } +} + +void ParameterServer2::op_au_bv_cw(const Operation& operation, + OperationResult* result) { + (void)result; + real* u = vectors_[operation.pvectors(0)]->getData(); + real* v = vectors_[operation.pvectors(1)]->getData(); + real* w = vectors_[operation.pvectors(2)]->getData(); + int64_t size = size_; + real a = operation.scalars(0); + real b = operation.scalars(1); + real c = operation.scalars(2); + for (int64_t i = 0; i < size; ++i) { + w[i] = a * u[i] + b * v[i] + c * w[i]; + } +} + +void ParameterServer2::op_make_steepest_desc_dir(const Operation& operation, + OperationResult* result) { + (void)result; + real* dir = vectors_[operation.pvectors(0)]->getData(); + real* grad = vectors_[operation.pvectors(1)]->getData(); + real* x = 
vectors_[operation.pvectors(2)]->getData(); + int64_t size = size_; + real l1weight = operation.scalars(0); + for (int64_t i = 0; i < size; ++i) { + if (x[i] < 0) { + dir[i] = -grad[i] + l1weight; + } else if (x[i] > 0) { + dir[i] = -grad[i] - l1weight; + } else { + if (grad[i] < -l1weight) { + dir[i] = -grad[i] - l1weight; + } else if (grad[i] > l1weight) { + dir[i] = -grad[i] + l1weight; + } else { + dir[i] = 0; + } + } + } +} + +void ParameterServer2::op_fix_dir_signs(const Operation& operation, + OperationResult* result) { + (void)result; + real* dir = vectors_[operation.pvectors(0)]->getData(); + real* steepestDescDir = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + for (int64_t i = 0; i < size; ++i) { + if (dir[i] * steepestDescDir[i] <= 0) { + dir[i] = 0; + } + } +} + +void ParameterServer2::op_fix_omega_signs(const Operation& operation, + OperationResult* result) { + (void)result; + real* x = vectors_[operation.pvectors(0)]->getData(); + real* newx = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + for (int64_t i = 0; i < size; ++i) { + if (x[i] * newx[i] < 0) { + newx[i] = 0; + } + } +} + +void ParameterServer2::op_dir_deriv(const Operation& operation, + OperationResult* result) { + real* dir = vectors_[operation.pvectors(0)]->getData(); + real* grad = vectors_[operation.pvectors(1)]->getData(); + real* x = vectors_[operation.pvectors(2)]->getData(); + int64_t size = size_; + real l1weight = operation.scalars(0); + double sum = 0; + for (int64_t i = 0; i < size; ++i) { + if (dir[i] != 0) { + if (x[i] < 0) { + sum += dir[i] * (grad[i] - l1weight); + } else if (x[i] > 0) { + sum += dir[i] * (grad[i] + l1weight); + } else if (dir[i] < 0) { + sum += dir[i] * (grad[i] - l1weight); + } else if (dir[i] > 0) { + sum += dir[i] * (grad[i] + l1weight); + } + } + } + result->add_scalars(sum); +} + +void ParameterServer2::op_cost(const Operation& operation, + OperationResult* result) { + real* x = vectors_[operation.pvectors(0)]->getData(); + real* newgrad = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + real l1weight = operation.scalars(0); + real l2weight = operation.scalars(1); + double cost_real = cost_ / mpiSize_; + double sum_weight_l1 = 0; + double sum_weight_l2 = 0; + for (int64_t i = 0; i < size; ++i) { + sum_weight_l1 += std::abs(x[i]); + sum_weight_l2 += x[i] * x[i]; + newgrad[i] += 2.0 * l2weight * x[i]; + } + cost_real += l1weight * sum_weight_l1 + l2weight * sum_weight_l2; + result->add_scalars(cost_real); +} + +ParameterServer2::OperatorFunction ParameterServer2::opFuncs[] = { + nullptr, // PSERVER_OP_utu = 0; + &ParameterServer2::op_utv, // PSERVER_OP_utv = 1; + &ParameterServer2::op_au, // PSERVER_OP_au = 2; + &ParameterServer2::op_au_bv, // PSERVER_OP_au_bv = 3; + nullptr, // PSERVER_OP_aAx_bu = 4; + &ParameterServer2::op_SGD, // PSERVER_OP_SGD = 5; + &ParameterServer2::op_RESET, // PSERVER_OP_RESET = 6; + &ParameterServer2::op_COPY, // PSERVER_OP_COPY = 7; + &ParameterServer2::op_au_bv_cw, // PSERVER_OP_au_bv_cw = 8; + &ParameterServer2::op_make_steepest_desc_dir, + /// PSERVER_OP_MAKE_STEEPEST_DESC_DIR = 9; + &ParameterServer2::op_fix_dir_signs, // PSERVER_OP_FIX_SIGNS = 10; + &ParameterServer2::op_dir_deriv, // PSERVER_OP_DIR_DERIV = 11; + &ParameterServer2::op_fix_omega_signs, // PSERVER_OP_FIX_OMEGA_SIGNS = 12; + &ParameterServer2::op_cost, // PSERVER_OP_COST = 13 + &ParameterServer2::op_start_pass, // PSERVER_OP_START_PASS = 14 + &ParameterServer2::op_finish_pass, // PSERVER_OP_FINISH_PASS = 15 + 
&ParameterServer2::op_randomize, // PSERVER_OP_RANDOMIZE = 16 + &ParameterServer2::op_apply, // PSERVER_OP_APPLY = 17 +}; + +void ParameterServer2::doOperation(const DoOperationRequest& request, + ProtoResponseCallback callback) { + if (request.wait_for_gradient()) { + /// wait for the gradient update + gradientReadyBarrier_.wait(); + allClientPassFinish_ = numPassFinishClients_ == FLAGS_num_gradient_servers; + } + + DoOperationResponse response; + response.set_pass_finish(allClientPassFinish_); + + for (const auto& op : request.operations()) { + OperationResult* opResult = response.add_results(); + if (op.operation() >= ARRAYSIZE(opFuncs)) { + LOG(ERROR) << "Unknown operation " << op.operation(); + response.set_return_message(kRetMsgUnknownOperation); + continue; // skip the unknown operation instead of indexing out of range + } + OperatorFunction opFunc = opFuncs[op.operation()]; + if (!opFunc) { + LOG(ERROR) << "Operation not implemented: " << op.operation(); + response.set_return_message(kRetMsgUnknownOperation); + continue; // skip instead of calling through a null function pointer + } + (this->*opFunc)(op, opResult); + } + + if (request.send_back_parameter()) { + /// clean the current cost + cost_ = 0; + + if (allClientPassFinish_ && request.release_pass()) { + /// This signals that all clients finished one pass, so waitPassFinish() + /// will stop waiting. + numPassFinishClients_ = 0; + } + + /// notify addGradient() to send back the parameter + parameterReadyBarrier_.wait(); + } + callback(response); +} + +void ParameterServer2::waitPassStart(const WaitPassStartRequest& request, + ProtoResponseCallback callback) { + passBarrier_.wait(); + callback(WaitPassStartResponse()); +} + +void ParameterServer2::waitPassFinish(const WaitPassFinishRequest& request, + ProtoResponseCallback callback) { + numPassFinishClients_ += 1; + + while (numPassFinishClients_ != 0) { + /// notify doOperation that the gradient is ready + gradientReadyBarrier_.wait(); + /// wait for doOperation to finish + parameterReadyBarrier_.wait(); + } + + callback(WaitPassFinishResponse()); +} + +void ParameterServer2::synchronize(const SynchronizeRequest& request, + ProtoResponseCallback callback) { + synchronizeBarriers_[request.sync_object_id()]->wait(); + dataSize_ = 0; + callback(SynchronizeResponse()); +} + +void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request, + ProtoResponseCallback callback) { + synchronizeBarriers_[request.sync_object_id()]->wait(); + callback(SynchronizeResponse()); + + if (request.trainer_id() == 0) { + batchId_ = 0; + } +} + +void ParameterServer2::createVector(const CreateVectorRequest& request, + ProtoResponseCallback callback) { + (void)request; + CreateVectorResponse response; + LOG(INFO) << "ParameterServer2::createVector: size=" << size_; + CpuVectorPtr vec = std::make_shared<CpuVector>(size_); + int64_t handle = -1; + { + std::lock_guard<RWLock> guard(parameterMutex_); + handle = vectors_.size(); + vectors_.push_back(vec); + } + response.set_handle(handle); + callback(response); +} + +void ParameterServer2::releaseVector(const ReleaseVectorRequest& request, + ProtoResponseCallback callback) { + ReleaseVectorResponse response; + CpuVectorPtr vec; + { + std::lock_guard<RWLock> guard(parameterMutex_); + vec.swap(vectors_[request.handle()]); + } + callback(response); +} + +void ParameterServer2::createMatrix(const CreateMatrixRequest& request, + ProtoResponseCallback callback) { + CreateMatrixResponse response; + /// We need to create a column-major matrix of size_ * num_cols. + /// The underlying Matrix is row major and needs to be transposed when used.
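  /// Illustrative note (editorial, not part of this change): the storage
  /// created below has row-major shape (num_cols, size_), so a consumer
  /// wanting the logical column-major (size_, num_cols) view can treat
  /// row r of this matrix as logical column r.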
+ CpuMatrixPtr mat = std::make_shared<CpuMatrix>(request.num_cols(), size_); + int64_t handle = -1; + { + std::lock_guard<RWLock> guard(parameterMutex_); + handle = matrices_.size(); + matrices_.push_back(mat); + } + response.set_handle(handle); + callback(response); +} + +void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request, + ProtoResponseCallback callback) { + ReleaseMatrixResponse response; + CpuMatrixPtr mat; + { + std::lock_guard<RWLock> guard(parameterMutex_); + mat.swap(matrices_[request.handle()]); + } + callback(response); +} + +} // namespace paddle diff --git a/paddle/legacy/pserver/ParameterServer2.h b/paddle/legacy/pserver/ParameterServer2.h new file mode 100644 index 0000000000000000000000000000000000000000..040699878d4d3c2f7effa2b6394a663441512dda --- /dev/null +++ b/paddle/legacy/pserver/ParameterServer2.h @@ -0,0 +1,696 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/parameter/ParameterOptimizer.h" +#include "paddle/utils/Common.h" +#include "paddle/utils/Locks.h" +#include "paddle/utils/Stat.h" +#include "paddle/utils/ThreadLocal.h" + +#include "ParameterService.pb.h" + +#include "ProtoServer.h" + +DECLARE_int32(port); + +namespace paddle { + +// @TODO(yanfei): +// if armed with high-density computation resources per node, the pserver could +// also utilize the GPU to reduce overhead. If this mechanism is used, it could +// pipeline network receiving and GPU computation to reduce the network +// overhead even further. The pipeline could help to accelerate BIG model +// training. +// @TODO(yanfei): +// for CPU-only machines or machines with few/slow GPUs, the time spent on the +// forward and backward passes could be larger than the optimization at the +// pserver. However, if armed with lots of GPUs per node and the model is large +// enough that limited CPU computation causes big optimization latency, the GPU +// may be required by the pserver. + +/** + * RPC service interface of the parameter server + * + * It implements several RPC APIs for remote parameter clients. + * For sync-sgd, a client needs one controller thread to build connections + * to all pservers; these controller connections do barrier + * synchronization, while the accompanying data connections are used for + * transferring data. + * Each data connection uses block-based fine-grained synchronization + * to gain better scalability. Gradients from different trainers are + * merged concurrently in block units, so that some network + * overhead is hidden by the merging. + * For async-sgd, the difference is that the pserver does optimization + * immediately once the gradients are ready, so the pserver needs to + * prepare a separate buffer to store the values sent back to the trainer, + * to prevent them from being polluted.
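 *
 * A rough sketch of one sync-sgd step as seen from this server
 * (editorial, illustrative only):
 *   trainer k  --sendParameter(ADD_GRADIENT)-->  addGradient()
 *   controller --doOperation(op_SGD)--------->   optimize the merged gradients
 *   addGradient() then sends the updated values back to every trainer.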
+ */ +class ParameterServer2 : public ProtoServer { + protected: + /// parameter_ mutex. + RWLock parameterMutex_; + + typedef std::pair<size_t, int64_t> BlockKey; + struct BlockKeyHash { + size_t operator()(const BlockKey& key) const { + return std::hash<size_t>()(key.first) + key.second; + } + }; + + // TODO(yanfei): + // if the index data structure were based on parameters instead of blocks, + // lookup performance could be better. In addition, block memory + // access mostly exhibits good locality, so the index data structure and + // the block data structure can be refined further, especially if a gpu + // is used for the pserver. + /** + * all parameters are stored in a CpuVector, with a blockMap_ data structure + * to index the block data required by requests. + */ + typedef std::unordered_map<BlockKey, int64_t, BlockKeyHash> BlockMap; + /// <(para, block), global offset(byte) in all parameters> + BlockMap blockOffsetMap_; + /// <(para, block), global idx [0, nBlocksInAllParameters]> + BlockMap blockIdMap_; + + std::vector<CpuVectorPtr> vectors_; + std::vector<CpuMatrixPtr> matrices_; + std::vector<CpuMemHandlePtr> dataMems_; + + // TODO(yanfei): + // if the sparse_remote_update() flag were stored in the request instead of + // read from configMap_, and the config were stored in a new block-wise + // overview data structure, the config mapping and the block mapping + // could be unified in a single clean data structure: use para_id + // to index parameters, use the offset to index a block within a parameter, + // and merge the two indices into a single one. + /** + * mapping between parameter and config + * different parameters allow different configs, such as decay_rate. + * for each request, the config needs to be read for gradient + * addition and optimization. + */ + std::unordered_map<size_t, ParameterConfig> configMap_; + + /** + * to parallelize the multi-thread, multi-connection + * computation at the pserver, blocks are used as the work unit to + * reduce contention for computation; furthermore, block-level + * optimizer control is used for each block, for the special + * reasons annotated below. + */ + struct BlockInfo { + const ParameterConfig* config; + std::unique_ptr<std::mutex> lock; + /// global offset for all parameters + uint64_t offset; + /** + * + * Async sgd at the pserver is very different from sync sgd. + * Each trainer follows startBatch, update*, finishBatch as in + * sync sgd, but all these actions are executed by + * multiple cores and threads almost simultaneously, so async + * sgd optimization is in reality block based, and per-block + * optimization is indeed necessary. In addition, + * per-block optimization is also preferred for performance + * with multiple threads. + */ + std::unique_ptr<ParameterOptimizer> optimizer; + }; + std::vector<BlockInfo> blockInfos_; + + typedef std::vector<std::pair<int64_t, int64_t>> BlockSegments; + /// Because some blocks might not be fully used, we keep a + /// record of which segments are used. + BlockSegments usedSegments_; + + /// records the pserver status; all statuses are defined in + /// ParameterService.pb + PServerStatus status_; + /// records all samples processed, which could be used by the optimizer + std::atomic<int64_t> numSamplesProcessed_; + double cost_; + int mpiSize_; + int dataSize_; + /// configuration of the current parameter optimizer + OptimizationConfig config_; + + /** + * The ReadWriteBuffer is based on std::vector, but aligned for avx/sse + * computation, and adds some helper methods to allocate memory-aligned + * blocks. + * + * @param T the type of the elements. + * @param AlignBytes the alignment in bytes for allocated blocks.
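 *
 * Usage sketch (editorial; the 32-byte AVX alignment is only an assumed
 * example value for AlignBytes):
 *   ReadWriteBuffer<real, 32> buf;
 *   buf.resizeWithAlignHints(totalElements, numBlocks);
 *   buf.resetAlignAlloc();
 *   real* block = buf.nextBlock(blockSize);  // aligned block start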
+ */ + template <typename T, size_t AlignBytes> + class ReadWriteBuffer + : public std::vector<T, AlignedAllocator<T, AlignBytes>> { + public: + static_assert(sizeof(T) % AlignBytes == 0 || AlignBytes % sizeof(T) == 0, + "Type T must be able to be aligned."); + + /** + * @brief IsTLargerThanAlign is a compile-time constant telling whether + * type T is larger than the alignment. + */ + constexpr static bool IsTLargerThanAlign = sizeof(T) >= AlignBytes; + + static_assert(std::is_pod<T>::value, "T must be a POD type."); + + /** + * @brief if AlignBytes > sizeof(T), then calculate how many elements + * can be stored in AlignBytes. + */ + constexpr static size_t AlignElementCount = AlignBytes / sizeof(T); + + static_assert(AlignElementCount == + (AlignElementCount & -AlignElementCount) || + AlignBytes > sizeof(T), + "AlignElementCount should be a power of 2"); + + /** + * @brief Resize the buffer, given the number of blocks that will be + * allocated. Each block will be memory aligned to AlignBytes. + * @param size The element count over all blocks. + * @param alignBlockCount The number of blocks that will be allocated. + */ + void resizeWithAlignHints(size_t size, size_t alignBlockCount = 1) { + if (IsTLargerThanAlign) { //! So each element is memory aligned. + this->resize(size); + } else { + //! at most, we need this many extra elements in the buffer to make + //! sure each block is aligned. + this->resize(size + alignBlockCount * (AlignElementCount - 1)); + } + } + + /** + * @brief reset the aligned block allocation. + */ + void resetAlignAlloc() { this->curOffset_ = 0; } + + /** + * @brief get the next aligned block address. + * @param blockSize the element count in each block. + * @return the aligned block address. + */ + T* nextBlock(size_t blockSize) { + T* r = &this->operator[](curOffset_); + curOffset_ += blockSize; + + if (!IsTLargerThanAlign) { + curOffset_ = + (curOffset_ + AlignElementCount - 1) & ~(AlignElementCount - 1); + } + return r; + } + + private: + size_t curOffset_; + }; + + /// buffers the data from the network for further processing, to + /// reduce redundant memory allocation. + ThreadLocal> readWriteBuffer_; + + /// size of the parameter + int64_t size_; + + /// for synchronized training, check the details in addGradient() + /// and doOperation() + ThreadBarrier gradientReadyBarrier_; + ThreadBarrier parameterReadyBarrier_; + ThreadBarrier passBarrier_; + ThreadLocal<std::vector<SendParameterRequest>> requestVec_; + ThreadLocal<std::vector<ProtoResponseCallbackEx>> callbackVec_; + + std::atomic<int> numPassFinishClients_; + bool allClientPassFinish_; + + std::vector<std::unique_ptr<ThreadBarrier>> synchronizeBarriers_; + std::atomic<int64_t> serverId_; + + /** + * + * for lagged gradient commit control in async sgd: + * discard lagged gradients from nodes that are too slow, whose gradients + * exhibit bad quality. + * Algorithm: + * pserver: + * 1. initialize asyncUpdateSteps_ = 0, asyncTrainerSteps_[N] = 0. + * asyncUpdateSteps_ is + * the version of the parameter value. + * 2. when a pull arrives, record asyncUpdateSteps_ into + * asyncTrainerSteps_[trainer_id] + * 3. when a push arrives, compare asyncUpdateSteps_ with + * asyncTrainerSteps_[trainer_id]; + * if delta > threshold, discard the current gradient, else commit + * the gradient. + * 4. reset asyncUpdateSteps_ and asyncTrainerSteps_[N] when the pass + * finishes + * Note: + * it cannot strictly discard all lagged gradients in some special + * conditions; part of the gradients could be discarded if + * ConcurrentRemoteParameterUpdater is used.
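 * In pseudo-code, the commit check at push time is roughly (an
 * editorial restatement of steps 2-3 above):
 *   delta = asyncUpdateSteps_ - asyncTrainerSteps_[trainer_id];
 *   if (delta > asyncLaggedThreshold_) discard the gradient; else commit;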
+ * this algorithm is implemented in asyncSGD() + */ + int64_t asyncLaggedThreshold_; + std::atomic<int64_t> asyncUpdateSteps_; + std::vector<int64_t> asyncTrainerSteps_; + size_t asyncLaggedGradientsNum_; + /// stats over all async updates + std::vector<int64_t> asyncUpdateStat_; + /// stats per trainer_id + std::vector<int64_t> asyncTrainerDiscardStat_; + /// stats per trainer_id + std::vector<int64_t> asyncTrainerCommitStat_; + + /// only used by the controller and other control cmds from trainer number 0 + std::unique_ptr<SyncThreadPool> syncThreadPool_; + + /// pserver for sparse remote-update parameters + bool isSparseServer_; + + /// barrier performance tuning required by sync-sgd + std::atomic<int64_t> batchId_; + + public: + struct Buffer { + real* base; + size_t size; + }; + + protected: + /// async gradient commit control + bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request); + + public: + /// disable the default parameter for overloading + /// @rdmaCpu: the id of the cpu core hosting the RDMA server (0-N); + /// -1 means using TCP transport instead of RDMA + ParameterServer2(const std::string& addr, int port, int rdmaCpu = -1); + + ~ParameterServer2() {} + + static const std::string kRetMsgInvalidMatrixHandle; + static const std::string kRetMsgInvalidVectorHandle; + static const std::string kRetMsgUnknownOperation; + + /// service functions + template <typename Dtype> + void reduceAndSendData(const SendDataRequest& request, + std::unique_ptr<MsgReader>& msgReader, + ProtoResponseCallbackEx& callback); + + void templateReduceSum(const SendDataRequest& request, + std::unique_ptr<MsgReader>& msgReader, + ProtoResponseCallbackEx& callback); + + /** + * @brief framework for sending parameters + * + * @note different parameter data types can be sent to the pserver. + * In most cases, this API is used to send gradients from + * a trainer to the pserver. + * It can also be used to retrieve parameters from the pserver. + */ + void sendParameter(const SendParameterRequest& request, + std::unique_ptr<MsgReader> msgReader, + ProtoResponseCallbackEx callback); + + void sendData(const SendDataRequest& request, + std::unique_ptr<MsgReader> msgReader, + ProtoResponseCallbackEx callback); + + /** + * @brief send config to the pserver + * + * @note it helps the pserver to understand the configuration for + * optimization, + * logging control, duplicated initialization, etc. + */ + void setConfig(const SetConfigRequest& request, + ProtoResponseCallback callback); + + /** + * @brief get the status of the pserver + * + * @note used to check whether parameters are ready at the pserver + */ + void getStatus(const GetStatusRequest& request, + ProtoResponseCallback callback); + + /** + * @brief set the status of the pserver + * + * @note used to mark whether parameters are ready at the pserver, since + * parameters at the pserver are initialized by the trainer + */ + void setStatus(const SetStatusRequest& request, + ProtoResponseCallback callback); + + /** + * @brief framework for doing some operations at the pserver end + * + * @note if sync-sgd is used, the controller calls the op_SGD action + * for gradient optimization. + * Check the available operations in opFuncs[]. + */ + void doOperation(const DoOperationRequest& request, + ProtoResponseCallback callback); + + /// Create a column vector. The size is the dimension of the parameter. + void createVector(const CreateVectorRequest& request, + ProtoResponseCallback callback); + + void releaseVector(const ReleaseVectorRequest& request, + ProtoResponseCallback callback); + + /// Create a column-major matrix. The number of rows is the dimension of + /// the parameter. The number of columns is specified by num_cols.
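  /// e.g. an illustrative client-side sequence (editorial sketch; compare
  /// the operation test in test_ParameterServer2.cpp):
  ///   PServerMatrix m = client.createMatrix(numCols);  // handle into matrices_
  ///   ... use m.handle in subsequent requests ...
  ///   client.releaseMatrix(m);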
+ void createMatrix(const CreateMatrixRequest& request, + ProtoResponseCallback callback); + + void releaseMatrix(const ReleaseMatrixRequest& request, + ProtoResponseCallback callback); + /** + * @brief stateful control for indicating the start of a sync pass + * + * @note it is valuable for logging and state control, + * especially for sync-sgd control + */ + void waitPassStart(const WaitPassStartRequest& request, + ProtoResponseCallback callback); + + /** + * @brief stateful control for indicating the end of a sync pass + * + * @note it is valuable for logging and state control, + * especially for sync-sgd control + */ + void waitPassFinish(const WaitPassFinishRequest& request, + ProtoResponseCallback callback); + + /** + * @brief synchronize all distributed trainers + * + * @note it is a general API for synchronizing trainers and pservers + */ + void synchronize(const SynchronizeRequest& request, + ProtoResponseCallback callback); + + /** + * @brief stateful control for indicating that an async pass is finished + * + * @note it is valuable for logging control, state reset, etc. + */ + void asyncFinishPass(const SynchronizeRequest& request, + ProtoResponseCallback callback); + + void loadValueVector(const LoadValueRequest& request, + ProtoResponseCallback callback); + + void saveValueVector(const SaveValueRequest& request, + ProtoResponseCallback callback); + + public: + /** + * @brief initialize the parameter server + */ + bool init(); + + /** + * @brief set parameters at the pserver + * + * @note does parameter initialization if necessary. + */ + void setParameter(const SendParameterRequest& request, + std::vector<Buffer>& inputBuffers, + SendParameterResponse* response, + std::vector<Buffer>* outputBuffers); + + /** + * @brief receive gradients and do optimization for async-sgd + * + * @note this API asynchronously receives data from all + * trainers, immediately does optimization, and returns the + * optimized values to the trainer. + * The routine performs block-based atomic updates, which + * means different blocks may be based on differently stale + * gradients. + * It discards some lagged gradients by default for + * better convergence. + */ + void asyncSGD(const SendParameterRequest& request, + std::vector<Buffer>& inputBuffers, + SendParameterResponse* response, + std::vector<Buffer>* outputBuffers); + + /** + * @brief merge gradients from all trainers + * + * @note this API uses block-based fine-grained parallelization, + * which reduces lock contention, hides communication latency, + * and harnesses multiple cores efficiently. + * It also implements the synchronization for sync-sgd. + */ + void addGradient(const SendParameterRequest& request, + std::vector<Buffer>& inputBuffers, + SendParameterResponse* response, + std::vector<Buffer>* outputBuffers); + + /** + * @brief get dense parameters from the pserver + * + * @note under some conditions, trainers get parameters from + * the pservers. + * e.g. + * if all parameters are stored at the pserver end for big-model + * training, a trainer can use it to retrieve all parameters + * when necessary. + */ + void getParameter(const SendParameterRequest& request, + std::vector<Buffer>& inputBuffers, + SendParameterResponse* response, + std::vector<Buffer>* outputBuffers); + + /** + * @brief get sparse values from the parameter server + * + * @note with sparse updates enabled, pservers own all the latest values, + * while trainers only retrieve the values they need. + * e.g. + * a trainer prefetches the necessary latest values from the + * pserver for its sparse calculation.
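 * In pseudo-code, one prefetch round trip is roughly (editorial
 * sketch):
 *   trainer marks the rows it needs -> getParameterSparse() ->
 *   the pserver copies only those rows back via
 *   sendBackParameterSparse().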
+ */ + void getParameterSparse(const SendParameterRequest& request, + std::vector<Buffer>& inputBuffers, + SendParameterResponse* response, + std::vector<Buffer>* outputBuffers); + + protected: + void mergeSegments(BlockSegments* segments); + + /// set the unused segments to zero + void clearUnusedSegments(CpuVector* vec); + + // TODO(yanfei): + // if data reading and optimization were interleaved block by block, + // performance could be better because of less network congestion. + /// read all data from the connection and store it in a static + /// pre-allocated buffer + void readAllBlocks(MsgReader* msgReader, + std::vector<Buffer>* buffers); + + const ParameterConfig& getParameterConfig(const ParameterBlock& block) { + CHECK_LT(block.para_id(), -1UL) << "invalid parameter id:" + << block.para_id(); + const auto it = configMap_.find(block.para_id()); + CHECK(it != configMap_.end()) << "can not find parameter id: " + << block.para_id(); + return it->second; + } + + /// it implicitly checks blockOffsetMap_ while retrieving the blockId + const ParameterConfig& getParameterConfig(int64_t blockId) const { + CHECK(blockId >= 0 && blockId < (int64_t)blockInfos_.size()) + << "block idx out of range, id: " << blockId + << " info size: " << blockInfos_.size(); + return *(blockInfos_[blockId].config); + } + + template <typename Response> + bool isValidVectorHandle(int64_t handle, Response* response) { + if (handle < 0 || (size_t)handle >= vectors_.size()) { + LOG(ERROR) << "Invalid vector handle " << handle; + response->set_return_message(kRetMsgInvalidVectorHandle); + return false; + } + return true; + } + + template <typename Response> + bool isValidMatrixHandle(int64_t handle, Response* response) { + if (handle < 0 || (size_t)handle >= matrices_.size()) { + LOG(ERROR) << "Invalid matrix handle " << handle; + response->set_return_message(kRetMsgInvalidMatrixHandle); + return false; + } + return true; + } + + /** + * @brief get block offset + * + * @note block.begin_dim is added to the block offset. + * return -1 if the block cannot be found + */ + int64_t getBlockOffset(const ParameterBlock& block) const { + BlockKey key(block.para_id(), block.block_id()); + auto it = blockOffsetMap_.find(key); + if (it == blockOffsetMap_.end()) { + return -1; + } + return it->second; + } + + /// return -1 if the block cannot be found + int64_t getBlockId(const ParameterBlock& block) const { + BlockKey key(block.para_id(), block.block_id()); + auto it = blockIdMap_.find(key); + if (it == blockIdMap_.end()) { + return -1; + } + return it->second; + } + + /** + * @brief prepare data for sending back + * + * @note modifies the response and outputBuffers for sending the parameter + * back to the client. The buffer for socket sending uses + * vectors_[parameterType] directly + * for dense with sync-sgd + */ + void sendBackParameter(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + std::vector<Buffer>* outputBuffers); + + /** + * @brief prepare data for sending back + * + * @note modifies the response and outputBuffers for sending the parameter + * back to the client. The buffer for socket sending uses buffer->base. + * The parameter values are copied from vectors_[parameterType] + * to buffer->base.
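 * (Editorial note: the extra copy matters because under async-sgd
 * vectors_[parameterType] can be updated concurrently by other
 * requests; the zero-copy overload above is only safe for sync-sgd,
 * where trainers wait while the value vector is stable.)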
+ * for dense with async-sgd + */ + void sendBackParameter(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, + std::vector<Buffer>* outputBuffers); + /** + * @brief prepare data for sending back + * + * @note specialized for sparse + */ + void sendBackParameterSparse(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, + size_t width, + std::vector<Buffer>* outputBuffers); + + /** + * framework routine for block parallelization + * e.g. + * for optimization over all blocks at the pserver end, this routine can + * facilitate parallelizing the optimization over all blocks with + * multiple threads. + */ + typedef std::function<void(int64_t blockId, const VectorPtr vecs[])> ExecFunc; + void parallelExecForEachBlock(ExecFunc func); + void blockTraverse(BlockInfo& info, + const ParameterConfig& config, + int64_t offset, + size_t size, + const VectorPtr vecs[], + const ParameterOptimizer::TraverseCallback& callback); + + public: + typedef void (ParameterServer2::*OperatorFunction)(const Operation& operation, + OperationResult* result); + + /** + * doOperation will call the following operations indirectly + * e.g. + * for sync-sgd control, the controller in the remote updater sends the + * op_SGD command to the pserver, then immediately sends a sendParameter + * request to the pserver. + * The two functions at the pserver end cooperate to achieve sync-sgd + * gradient merging and optimization. + * Most of the following operations are specific to owlqn; all operations + * run in the context of the doOperation function. + */ + static OperatorFunction opFuncs[]; + + void op_SGD(const Operation& operation, OperationResult* result); + + void op_RESET(const Operation& operation, OperationResult* result); + + void op_utv(const Operation& operation, OperationResult* result); + + void op_au_bv(const Operation& operation, OperationResult* result); + + void op_COPY(const Operation& operation, OperationResult* result); + + void op_au(const Operation& operation, OperationResult* result); + + void op_au_bv_cw(const Operation& operation, OperationResult* result); + + void op_make_steepest_desc_dir(const Operation& operation, + OperationResult* result); + + void op_fix_dir_signs(const Operation& operation, OperationResult* result); + + void op_dir_deriv(const Operation& operation, OperationResult* result); + + void op_fix_omega_signs(const Operation& operation, OperationResult* result); + + void op_cost(const Operation& operation, OperationResult* result); + + void op_start_pass(const Operation& operation, OperationResult* result); + void op_finish_pass(const Operation& operation, OperationResult* result); + + void op_apply(const Operation& operation, OperationResult* result); + + void op_randomize(const Operation& operation, OperationResult* result); + + void op_load(const Operation& operation, OperationResult* result); + void op_save(const Operation& operation, OperationResult* result); +}; + +} // namespace paddle diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/legacy/pserver/ParameterServer2Main.cpp similarity index 100% rename from paddle/pserver/ParameterServer2Main.cpp rename to paddle/legacy/pserver/ParameterServer2Main.cpp diff --git a/paddle/pserver/ParameterServerController.cpp b/paddle/legacy/pserver/ParameterServerController.cpp similarity index 100% rename from paddle/pserver/ParameterServerController.cpp rename to paddle/legacy/pserver/ParameterServerController.cpp diff --git a/paddle/pserver/ParameterServerController.h b/paddle/legacy/pserver/ParameterServerController.h
similarity index 100% rename from paddle/pserver/ParameterServerController.h rename to paddle/legacy/pserver/ParameterServerController.h diff --git a/paddle/pserver/ProtoServer.cpp b/paddle/legacy/pserver/ProtoServer.cpp similarity index 100% rename from paddle/pserver/ProtoServer.cpp rename to paddle/legacy/pserver/ProtoServer.cpp diff --git a/paddle/pserver/ProtoServer.h b/paddle/legacy/pserver/ProtoServer.h similarity index 100% rename from paddle/pserver/ProtoServer.h rename to paddle/legacy/pserver/ProtoServer.h diff --git a/paddle/pserver/RDMANetwork.h b/paddle/legacy/pserver/RDMANetwork.h similarity index 100% rename from paddle/pserver/RDMANetwork.h rename to paddle/legacy/pserver/RDMANetwork.h diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/legacy/pserver/SocketChannel.cpp similarity index 100% rename from paddle/pserver/SocketChannel.cpp rename to paddle/legacy/pserver/SocketChannel.cpp diff --git a/paddle/pserver/SocketChannel.h b/paddle/legacy/pserver/SocketChannel.h similarity index 100% rename from paddle/pserver/SocketChannel.h rename to paddle/legacy/pserver/SocketChannel.h diff --git a/paddle/pserver/SparseParameterDistribution.cpp b/paddle/legacy/pserver/SparseParameterDistribution.cpp similarity index 100% rename from paddle/pserver/SparseParameterDistribution.cpp rename to paddle/legacy/pserver/SparseParameterDistribution.cpp diff --git a/paddle/pserver/SparseParameterDistribution.h b/paddle/legacy/pserver/SparseParameterDistribution.h similarity index 100% rename from paddle/pserver/SparseParameterDistribution.h rename to paddle/legacy/pserver/SparseParameterDistribution.h diff --git a/paddle/pserver/test/.gitignore b/paddle/legacy/pserver/test/.gitignore similarity index 100% rename from paddle/pserver/test/.gitignore rename to paddle/legacy/pserver/test/.gitignore diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/legacy/pserver/test/CMakeLists.txt similarity index 100% rename from paddle/pserver/test/CMakeLists.txt rename to paddle/legacy/pserver/test/CMakeLists.txt diff --git a/paddle/legacy/pserver/test/SocketTest.cpp b/paddle/legacy/pserver/test/SocketTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bb9ee355ddbd1333cb45965e6756938d7f44fe38 --- /dev/null +++ b/paddle/legacy/pserver/test/SocketTest.cpp @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/utils/Util.h" + +#include +#include +#include +#include +#include + +#include + +#include "paddle/legacy/math/Vector.h" +#include "paddle/utils/Logging.h" + +struct MessageHeader { + int64_t dataLength; +}; + +class Thread { + public: + void start(); + virtual void run() = 0; + virtual ~Thread() {} + + protected: + std::unique_ptr thread_; +}; + +void Thread::start() { + thread_.reset(new std::thread([this]() { this->run(); })); +} + +class SocketChannel { + public: + explicit SocketChannel(int socket) : socket_(socket) {} + int getSocketFd() const { return socket_; } + uint64_t readAll(void* buf, size_t size); + uint64_t writeAll(const void* buf, size_t size); + + protected: + int socket_; +}; + +uint64_t SocketChannel::readAll(void* buf, size_t size) { + uint64_t total = 0; + while (total < size) { + int64_t len = read(socket_, (char*)buf + total, size - total); + if (len <= 0) { + return total; + } + total += len; + } + return total; +} + +uint64_t SocketChannel::writeAll(const void* buf, size_t size) { + uint64_t total = 0; + while (total < size) { + int64_t len = write(socket_, (const char*)buf + total, size - total); + if (len <= 0) { + return total; + } + total += len; + } + return total; +} + +class SocketWorker : public Thread { + public: + explicit SocketWorker(int socket) : channel_(socket) {} + virtual void run(); + + // read n bytes. + int64_t readAll(char* buf, size_t n); + + // write n bytes + + protected: + SocketChannel channel_; + std::string buffer_; +}; + +class SocketServer : public Thread { + public: + explicit SocketServer(int port) + : port_(port), socket_(0), maxPendingConnections_(100) {} + + virtual void run(); + + protected: + int port_; + int socket_; + int maxPendingConnections_; +}; + +void SocketServer::run() { + int newsockfd; + socklen_t clilen; + struct sockaddr_in serv_addr, cli_addr; + + /* First call to socket() function */ + socket_ = socket(AF_INET, SOCK_STREAM, 0); + CHECK(socket_ >= 0) << "ERROR opening socket"; + + /* Initialize socket structure */ + bzero((char*)&serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = INADDR_ANY; + serv_addr.sin_port = htons(port_); + + /* Now bind the host address using bind() call.*/ + CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) + << "ERROR on binding"; + + /* Now start listening for the clients, here process will + * go in sleep mode and will wait for the incoming connection + */ + listen(socket_, maxPendingConnections_); + clilen = sizeof(cli_addr); + + while (true) { + /* Accept actual connection from the client */ + newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen); + CHECK(newsockfd >= 0) << "ERROR on accept"; + + SocketWorker* worker = new SocketWorker(newsockfd); + worker->start(); + } +} + +void SocketWorker::run() { + MessageHeader header; + + while (true) { + int64_t n = channel_.readAll(&header, sizeof(header)); + CHECK(n == sizeof(header)) << "ERROR reading from socket"; + + buffer_.resize(header.dataLength); + n = channel_.readAll(&buffer_[0], header.dataLength); + CHECK(n == header.dataLength) << "ERROR reading from socket"; + + /* Write a response to the client */ + n = channel_.writeAll(&header, sizeof(header)); + CHECK(n == sizeof(header)) << "ERROR reading from socket"; + n = channel_.writeAll(buffer_.data(), buffer_.size()); + CHECK(n == header.dataLength) << "ERROR writing to socket"; + } +} + +class SocketClient { + public: + SocketClient(const std::string& serverAddr, int 
serverPort); + SocketChannel* getChannel() const { return channel_.get(); } + + protected: + std::unique_ptr channel_; +}; + +SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { + struct sockaddr_in serv_addr; + struct hostent* server; + + // char buffer[256]; + + /* Create a socket point */ + int sockfd = socket(AF_INET, SOCK_STREAM, 0); + CHECK(sockfd >= 0) << "ERROR opening socket"; + server = gethostbyname(serverAddr.c_str()); + CHECK(server) << "ERROR, no such host: " << serverAddr; + + bzero((char*)&serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + bcopy((char*)server->h_addr, + (char*)&serv_addr.sin_addr.s_addr, + server->h_length); + serv_addr.sin_port = htons(serverPort); + + /* Now connect to the server */ + CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) + << "ERROR connecting"; + + channel_.reset(new SocketChannel(sockfd)); +} + +DEFINE_string(server_addr, "127.0.0.1", "Server address"); +DEFINE_int64(dim, 10000000, "Data size"); +DEFINE_int32(loop_time, 100000, "test loop time"); + +using namespace paddle; // NOLINT + +int main(int argc, char** argv) { + paddle::initMain(argc, argv); + SocketServer server(FLAGS_port); + server.start(); + sleep(1); + + SocketClient client(FLAGS_server_addr, FLAGS_port); + + SocketChannel* channel = client.getChannel(); + + MessageHeader header; + + uint64_t dataSize = FLAGS_dim * sizeof(real); + +#ifdef PADDLE_WITH_CUDA + GpuVector gpuParam(FLAGS_dim); + GpuVector gpuGrad(FLAGS_dim); +#else + CpuVector gpuParam(FLAGS_dim); + CpuVector gpuGrad(FLAGS_dim); +#endif + CpuVector cpuParam(FLAGS_dim); + CpuVector cpuGrad(FLAGS_dim); + + gpuParam.rand(); + gpuGrad.rand(); + cpuParam.rand(); + cpuGrad.rand(); + + for (int i = 0; i < FLAGS_loop_time; ++i) { + cpuGrad.copyFrom(gpuGrad); + + header.dataLength = dataSize; + CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header)) + << "Client write header error"; + + CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize) + << "Client write data error"; + + /* Now read server response */ + CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header)) + << "Client read header error"; + + CHECK_EQ((uint64_t)header.dataLength, dataSize); + CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize) + << "Client read data error"; + + gpuParam.copyFrom(cpuParam); + + LOG_EVERY_N(INFO, 100) << "i=" << i; + } + exit(0); +} diff --git a/paddle/legacy/pserver/test/test_ParameterServer2.cpp b/paddle/legacy/pserver/test/test_ParameterServer2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..60419f3a4abad8b209706d568c52f3d4761b5445 --- /dev/null +++ b/paddle/legacy/pserver/test/test_ParameterServer2.cpp @@ -0,0 +1,624 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_int32(num_gradient_servers); +DEFINE_string(server_addr, "127.0.0.1", "assign server address"); +DEFINE_int32(server_cpu, 0, "assign server cpu"); + +class ParameterServer2Tester : public ParameterServer2 { + public: + ParameterServer2Tester(std::string serverAddr, + int port, + int rdmaCpu = -1, + bool sepSendAndRecv = false) + : ParameterServer2(serverAddr, port, rdmaCpu), client_(sepSendAndRecv) {} + virtual ~ParameterServer2Tester() {} + void setup() { + CHECK(ParameterServer2::init()); + + parameters_.clear(); + clientConfigs_.clear(); + + clientConfigs_.resize(2); + { + ParameterConfig& config = clientConfigs_[0]; + config.set_name("para0"); + config.set_para_id(0); + config.set_size(10000); + config.set_device(-1); + config.set_learning_rate(1.0); + config.set_momentum(0.9); + } + + { + ParameterConfig& config = clientConfigs_[1]; + config.set_name("para1"); + config.set_para_id(1); + config.set_size(5000); + config.set_device(-1); + config.set_learning_rate(0.5); + config.set_momentum(0.4); + } + + for (auto& config : clientConfigs_) { + parameters_.emplace_back(new Parameter(config, /* useGpu= */ false)); + } + + size_t id = 0; + for (auto& para : parameters_) { + para->setID(id++); + } + + CHECK(client_.init(parameters_)); + OptimizationConfig optConfig; + optConfig.set_algorithm("async_sgd"); + optConfig.set_batch_size(100); + optConfig.set_learning_rate(0.1); + client_.setConfig(optConfig); + client_.setParameter(); + } + + void setConfigTest(); + void setStatusTest(); + void sendParameterTest(); + void sendDataTest(SendDataType type, size_t size); + void operationTest(); + void mergeBlockSegmentTest(); + void checkSegments(const BlockSegments& expected, const BlockSegments& segs); + void waitPassFinishTest(); + void synchronizeTest(); + + protected: + ParameterClient2 client_; + vector clientConfigs_; + vector parameters_; +}; + +std::unique_ptr g_server; + +void ParameterServer2Tester::setConfigTest() { + setup(); + + for (auto& config : clientConfigs_) { + auto it = configMap_.find(config.para_id()); + EXPECT_TRUE(it != configMap_.end()); + auto& serverConfig = it->second; + EXPECT_EQ(config.name(), serverConfig.name()); + EXPECT_EQ(config.size(), serverConfig.size()); + EXPECT_EQ(config.learning_rate(), serverConfig.learning_rate()); + EXPECT_EQ(config.momentum(), serverConfig.momentum()); + } +} + +void ParameterServer2Tester::setStatusTest() { + setup(); + EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_NOT_SET)); + client_.setStatus(PSERVER_STATUS_PARAMETER_READY); + EXPECT_EQ(PSERVER_STATUS_PARAMETER_READY, status_); + EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_PARAMETER_READY)); +} + +real sumVector(const CpuVector& vec) { + const real* data = vec.getData(); + size_t dim = vec.getSize(); + real sum = 0; + for (size_t i = 0; i < dim; ++i) { + sum += data[i]; + } + return sum; +} + +void ParameterServer2Tester::sendParameterTest() { + setup(); + + client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + false); // sendBackParameter = false + + vector parameterCopies; + + for (auto& parameter : parameters_) { + parameterCopies.emplace_back( + new Parameter(parameter->getConfig(), /* useGpu= */ false)); + parameterCopies.back() + ->getBuf(PARAMETER_VALUE) + ->copyFrom(*parameter->getBuf(PARAMETER_VALUE)); + } + + client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM, + 
PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true); // sendBackParameter = true + + for (size_t i = 0; i != parameters_.size(); ++i) { + real* v1 = parameters_[i]->getBuf(PARAMETER_VALUE)->getData(); + real* v2 = parameterCopies[i]->getBuf(PARAMETER_VALUE)->getData(); + EXPECT_EQ(parameters_[i]->getSize(), parameterCopies[i]->getSize()); + size_t size = parameters_[i]->getSize(); + real sum1 = 0, sum2 = 0; + for (size_t j = 0; j < size; ++j) { + sum1 += v1[j]; + sum2 += v2[j]; + } + EXPECT_EQ(sum1, sum2); + } +} + +void ParameterServer2Tester::sendDataTest(SendDataType type, size_t size) { + ParameterClient2 client1(true); + client1.init(parameters_); + ParameterClient2 client2(true); + client2.init(parameters_); + ParameterClient2 client3(true); + client3.init(parameters_); + + ThreadWorker worker1; + ThreadWorker worker2; + ThreadWorker worker3; + + double* testData1 = new double[size]; + double* testData2 = new double[size]; + double* testData3 = new double[size]; + double* getDataExpect = new double[size]; + double* getDataReal = new double[size]; + for (size_t i = 0; i < size; ++i) { + testData1[i] = rand(); // NOLINT TODO(yuyang18): Use rand_r instead. + testData2[i] = rand(); // NOLINT + testData3[i] = rand(); // NOLINT + getDataExpect[i] = testData1[i] + testData2[i] + testData3[i]; + } + + auto put1 = [&]() { + LOG(INFO) << "putOwnData1 start"; + client1.putOwnData(0, type, testData1, size); + LOG(INFO) << "putOwnData1 finish"; + }; + + auto get1 = [&]() { + LOG(INFO) << "sendData1 get all start"; + client1.getAllData(0, type, getDataReal, size); + for (size_t i = 0; i < size; ++i) { + CHECK_EQ(getDataReal[i], getDataExpect[i]); + } + LOG(INFO) << "sendData1 get all finish"; + }; + + auto put2 = [&]() { + LOG(INFO) << "putOwnData2 start"; + client2.putOwnData(1, type, testData2, size); + LOG(INFO) << "putOwnData2 finish"; + }; + + auto put3 = [&]() { + LOG(INFO) << "putOwnData3 start"; + client3.putOwnData(2, type, testData3, size); + LOG(INFO) << "putOwnData3 finish"; + }; + + worker1.addJob(put1); + worker1.addJob(get1); + worker2.addJob(put2); + worker3.addJob(put3); + + worker1.addJob(put1); + worker2.addJob(put2); + worker3.addJob(put3); + worker1.addJob(get1); + + worker1.wait(); + worker2.wait(); + worker3.wait(); + // these buffers were allocated with new[], so release them with delete[] + delete[] testData1; + delete[] testData2; + delete[] testData3; + delete[] getDataExpect; + delete[] getDataReal; +} + +void ParameterServer2Tester::operationTest() { + PServerVector v1, v2; + v1 = client_.createVector(); + EXPECT_EQ(NUM_PARAMETER_TYPES, v1.handle); + + v2 = client_.createVector(); + EXPECT_EQ(NUM_PARAMETER_TYPES + 1, v2.handle); + + PreparedOperations ops; + ops.addOperation(PSERVER_OP_RESET, v1, (real)1); + ops.addOperation(PSERVER_OP_RESET, v2, (real)2); + + real res1, res2, res3; + ops.addOperation(PSERVER_OP_utv, v1, v2)(&res1); + + ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1); + ops.addOperation(PSERVER_OP_utv, v1, v2)(&res2); + + ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1); + ops.addOperation(PSERVER_OP_utv, v1, v2)(&res3); + client_.doOperation(ops, false, false); + + EXPECT_EQ(30000, res1); + EXPECT_EQ(15000, res2); + EXPECT_EQ(0, res3); + + PServerMatrix m1, m2; + m1 = client_.createMatrix(4); + EXPECT_EQ(0, m1.handle); + m2 = client_.createMatrix(8); + EXPECT_EQ(1, m2.handle); + + // TODO(yuyang18): add tests for other operations OP_COPY, OP_au + + client_.releaseVector(v1); + client_.releaseVector(v2); + client_.releaseMatrix(m1); + client_.releaseMatrix(m2); +} + +void
ParameterServer2Tester::checkSegments(const BlockSegments& expected, + const BlockSegments& segs) { + EXPECT_EQ(expected.size(), segs.size()); + if (expected.size() != segs.size()) { + return; + } + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(expected[i], segs[i]); + } +} + +void ParameterServer2Tester::mergeBlockSegmentTest() { + { + BlockSegments segs{{10, 20}, {30, 45}, {50, 70}}; + mergeSegments(&segs); + checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {50, 70}, {10, 20}}; + mergeSegments(&segs); + checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {50, 70}, {10, 30}}; + mergeSegments(&segs); + checkSegments({{10, 45}, {50, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {10, 70}, {10, 30}}; + mergeSegments(&segs); + checkSegments({{10, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {50, 70}, {10, 35}}; + mergeSegments(&segs); + checkSegments({{10, 45}, {50, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {50, 70}, {10, 60}}; + mergeSegments(&segs); + checkSegments({{10, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {50, 70}, {30, 47}}; + mergeSegments(&segs); + checkSegments({{30, 47}, {50, 70}}, segs); + } +} + +void ParameterServer2Tester::waitPassFinishTest() { + ParameterClient2 client1; + ParameterClient2 client2; + ParameterClient2 client3; + + ThreadWorker worker1; + ThreadWorker worker2; + ThreadWorker worker3; + + auto init1 = [&]() { + LOG(INFO) << "init1 start"; + client1.init(parameters_); + LOG(INFO) << "init1 finish"; + }; + + auto init2 = [&]() { + LOG(INFO) << "init2 start"; + client2.init(parameters_); + LOG(INFO) << "init2 finish"; + }; + + auto init3 = [&]() { + LOG(INFO) << "init3 start"; + client3.init(parameters_); + LOG(INFO) << "init3 finish"; + }; + + auto update1 = [&]() { + LOG(INFO) << "update1 start"; + client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true); // sendBackParameter = false + LOG(INFO) << "update1 finish"; + }; + + auto wait1 = [&]() { + LOG(INFO) << "wait1 start"; + client1.waitPassFinish(); + LOG(INFO) << "wait1 finish"; + }; + + auto update2 = [&]() { + LOG(INFO) << "update2 start"; + client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true); // sendBackParameter = false + LOG(INFO) << "update2 finish"; + }; + + auto wait2 = [&]() { + LOG(INFO) << "wait2 start"; + client2.waitPassFinish(); + LOG(INFO) << "wait2 finish"; + }; + + auto op3 = [&]() { + LOG(INFO) << "op3 start"; + PreparedOperations ops; + ops.addOperation(PSERVER_OP_SGD); + client3.doOperation(ops, + /* waitForGradient= */ true, + /* sendBackarameter= */ true); + LOG(INFO) << "op3 finish"; + }; + + worker1.addJob(init1); + worker2.addJob(init2); + worker3.addJob(init3); + + worker1.addJob(update1); + worker2.addJob(update2); + worker3.addJob(op3); + + worker3.addJob(op3); + worker3.addJob(op3); + worker2.addJob(update2); + worker2.addJob(update2); + worker1.addJob(wait1); + + worker2.addJob(wait2); + worker3.addJob(op3); + + worker1.wait(); + worker2.wait(); + worker3.wait(); + + LOG(INFO) << "Pass 1 finished"; + + worker1.addJob(update1); + worker2.addJob(update2); + worker3.addJob(op3); + + worker1.wait(); + worker2.wait(); + worker3.wait(); + + worker3.addJob(op3); + worker3.addJob(op3); + worker1.addJob(update1); + worker1.addJob(wait1); + worker2.addJob(wait2); + + worker1.wait(); + worker2.wait(); + 
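  // Editorial note: each addJob sequence above pairs every ADD_GRADIENT
  // update with an op_SGD doOperation on worker3; without that pairing the
  // server's gradientReady/parameterReady barriers would block forever.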
worker3.wait(); + + LOG(INFO) << "Pass 2 finished"; +} + +void ParameterServer2Tester::synchronizeTest() { + ParameterClient2 client1; + ParameterClient2 client2; + + ThreadWorker worker1; + ThreadWorker worker2; + + FLAGS_log_period_server = 2; + + auto init1 = [&]() { + LOG(INFO) << "init1 start"; + client1.init(parameters_); + client1.setTrainerId(0); + LOG(INFO) << "init1 finish"; + }; + + auto init2 = [&]() { + LOG(INFO) << "init2 start"; + client2.init(parameters_); + client2.setTrainerId(1); + LOG(INFO) << "init2 finish"; + }; + + auto update1 = [&]() { + LOG(INFO) << "update1 start"; + client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true); // sendBackParameter = false + LOG(INFO) << "update1 finish"; + }; + + auto wait1 = [&]() { + LOG(INFO) << "wait1 start"; + client1.asyncFinishPass(); + LOG(INFO) << "wait1 finish"; + }; + + auto update2 = [&]() { + LOG(INFO) << "update2 start"; + client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true); // sendBackParameter = false + LOG(INFO) << "update2 finish"; + }; + + auto wait2 = [&]() { + LOG(INFO) << "wait2 start"; + client2.asyncFinishPass(); + LOG(INFO) << "wait2 finish"; + }; + + worker1.addJob(init1); + worker2.addJob(init2); + // call wait to reset some stats at pserver + worker1.addJob(wait1); + worker2.addJob(wait2); + + worker1.addJob(update1); + worker2.addJob(update2); + + worker2.addJob(update2); + worker2.addJob(update2); + worker1.addJob(wait1); + + worker2.addJob(wait2); + + worker1.wait(); + worker2.wait(); + LOG(INFO) << "Pass 1 finished"; + + worker1.addJob(update1); + worker2.addJob(update2); + + worker1.wait(); + worker2.wait(); + + worker1.addJob(update1); + worker2.addJob(update2); + worker1.addJob(update1); + worker1.addJob(update1); + worker1.addJob(update1); + worker1.addJob(update1); + worker1.addJob(update1); + worker1.addJob(update1); + worker1.addJob(wait1); + worker2.addJob(wait2); + + worker1.wait(); + worker2.wait(); + LOG(INFO) << "Pass 2 finished"; +} + +TEST(ParameterServer2, sendParameter) { g_server->sendParameterTest(); } + +TEST(ParameterServer2, setConfig) { g_server->setConfigTest(); } + +TEST(ParameterServer2, setStatus) { g_server->setStatusTest(); } + +TEST(ParameterServer2, operation) { g_server->operationTest(); } + +TEST(ParameterServer2, mergeBlockSegment) { g_server->mergeBlockSegmentTest(); } + +TEST(ParameterServer2, waitPassFinish) { g_server->waitPassFinishTest(); } + +TEST(ParameterServer2, synchronize) { g_server->synchronizeTest(); } + +TEST(ParameterServer2, sendData) { + // Set gserver and pserver all 3, so that the test is sufficient. 
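  // Editorial note: each of the three clients in sendDataTest contributes
  // one buffer via putOwnData(), so getAllData() must observe the
  // DATA_REDUCE_SUM of all three contributions across the three servers.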
+ int oldFlagsPortsNUm = FLAGS_ports_num; + int oldFlagsNumGradientServers = FLAGS_num_gradient_servers; + int oldFlagsPort = FLAGS_port; + FLAGS_ports_num = 3; + FLAGS_num_gradient_servers = 3; + FLAGS_port = FLAGS_port + 1; + std::unique_ptr g_server1; + std::unique_ptr g_server2; + std::unique_ptr g_server3; + if (FLAGS_rdma_tcp == "rdma") { + g_server1.reset(new ParameterServer2Tester( + FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu)); + g_server1->start(); + g_server2.reset(new ParameterServer2Tester( + FLAGS_server_addr, FLAGS_port + 1, FLAGS_server_cpu + 1)); + g_server2->start(); + g_server3.reset(new ParameterServer2Tester( + FLAGS_server_addr, FLAGS_port + 2, FLAGS_server_cpu + 2)); + g_server3->start(); + } else { // tcp + g_server1.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port)); + g_server1->start(); + g_server2.reset( + new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 1)); + g_server2->start(); + g_server3.reset( + new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 2)); + g_server3->start(); + } + + g_server2->init(); + g_server3->init(); + sleep(2); + g_server1->setup(); + g_server1->sendDataTest(DATA_REDUCE_SUM, 1 << 24); + sleep(2); + g_server1->sendDataTest(DATA_REDUCE_SUM, 2); + sleep(2); + g_server1.reset(); + g_server2.reset(); + g_server3.reset(); + + FLAGS_ports_num = oldFlagsPortsNUm; + FLAGS_num_gradient_servers = oldFlagsNumGradientServers; + FLAGS_port = oldFlagsPort; +} + +int main(int argc, char** argv) { + paddle::initMain(argc, argv); + testing::InitGoogleTest(&argc, argv); + + FLAGS_num_gradient_servers = 2; + + if (FLAGS_rdma_tcp == "rdma") { + g_server.reset(new ParameterServer2Tester( + FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu)); + } else { + g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port)); + } + + g_server->start(); + + sleep(2); + + int ret = RUN_ALL_TESTS(); + + g_server.reset(); + + exit(ret); +} diff --git a/paddle/legacy/pserver/test/test_ProtoServer.cpp b/paddle/legacy/pserver/test/test_ProtoServer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8d5e26f995771e0209db95342551d4800bbaa2a3 --- /dev/null +++ b/paddle/legacy/pserver/test/test_ProtoServer.cpp @@ -0,0 +1,169 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "ParameterService.pb.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/pserver/ProtoServer.h" +#include "paddle/utils/Stat.h" +#include "paddle/utils/Util.h" + +DEFINE_string(server_addr, "127.0.0.1", "Server address"); +DEFINE_int64(dim, 50000000, "Data size"); +DEFINE_bool(test_proto_server, true, "whether to test ProtoServer"); +DEFINE_bool(benchmark, false, "Do benchmark. 
Skip some tests"); + +using namespace paddle; // NOLINT + +class MyServer : public ProtoServer { + public: + explicit MyServer(int port, int rdmaCpu = -1) + : ProtoServer(FLAGS_server_addr, port, rdmaCpu), + status_(PSERVER_STATUS_NOT_SET) { + REGISTER_SERVICE_FUNCTION(MyServer, getStatus); + REGISTER_SERVICE_FUNCTION(MyServer, setStatus); + REGISTER_SERVICE_FUNCTION_EX(MyServer, getStatusEx); + } + void getStatus(const GetStatusRequest& request, + ProtoResponseCallback callback) { + (void)request; + GetStatusResponse response; + response.set_status(status_); + callback(response); + } + + void getStatusEx(const GetStatusRequest& request, + std::unique_ptr msgReader, + ProtoResponseCallbackEx callback) { + (void)request; + GetStatusResponse response; + response.set_status(status_); + buffer_.resize(msgReader->getNextBlockLength()); + msgReader->readNextBlock(&buffer_[0]); + callback(response, {{&buffer_[0], buffer_.size()}}); + } + + void setStatus(const SetStatusRequest& request, + ProtoResponseCallback callback) { + SetStatusResponse response; + status_ = request.status(); + callback(response); + } + + protected: + PServerStatus status_; + std::string buffer_; +}; + +TEST(ProtoServer, regular) { + ProtoClient* client; + if (FLAGS_rdma_tcp == "rdma") + client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA); + else + client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP); + { + GetStatusRequest request; + GetStatusResponse response; + auto msgReader = client->sendAndRecv("getStatus", request, &response); + EXPECT_EQ(response.status(), PSERVER_STATUS_NOT_SET); + EXPECT_EQ(msgReader->getNumBlocks(), (size_t)0); + } + + { + SetStatusRequest request; + SetStatusResponse response; + request.set_status(PSERVER_STATUS_PARAMETER_READY); + client->sendAndRecv("setStatus", request, &response); + } + + { + GetStatusRequest request; + GetStatusResponse response; + client->sendAndRecv("getStatus", request, &response); + EXPECT_EQ(response.status(), PSERVER_STATUS_PARAMETER_READY); + } + + delete client; +} + +TEST(ProtoServer, extended) { +#ifdef PADDLE_WITH_CUDA + ProtoClient* client; + if (FLAGS_rdma_tcp == "rdma") + client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA); + else + client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP); + int64_t dataSize = FLAGS_dim * sizeof(real); + + GpuVector gpuParam(FLAGS_dim); + GpuVector gpuGrad(FLAGS_dim); + CpuVector cpuParam(FLAGS_dim); + CpuVector cpuGrad(FLAGS_dim); + + gpuParam.rand(); + gpuGrad.rand(); + cpuParam.rand(); + cpuGrad.rand(); + + for (int k = 0; k < 4; ++k) { + for (int i = 0; i < 10; ++i) { + cpuGrad.copyFrom(gpuGrad); + if (FLAGS_test_proto_server) { + GetStatusRequest request; + GetStatusResponse response; + { + REGISTER_TIMER("sendAndRecv"); + auto msgReader = + client->sendAndRecv("getStatusEx", + request, + {{cpuGrad.getData(), (size_t)dataSize}}, + &response); + + EXPECT_EQ(msgReader->getNumBlocks(), (size_t)1); + EXPECT_EQ(msgReader->getNextBlockLength(), (size_t)dataSize); + msgReader->readNextBlock(cpuParam.getData()); + } + if (!FLAGS_benchmark) { + real* v1 = cpuGrad.getData(); + real* v2 = cpuParam.getData(); + real sum1 = 0, sum2 = 0; + for (int j = 0; j < FLAGS_dim; ++j) { + sum1 += v1[j]; + sum2 += v2[j]; + } + EXPECT_EQ(sum1, sum2); + } + } + gpuParam.copyFrom(cpuParam); + + LOG_EVERY_N(INFO, 10) << "i=" << i; + } + globalStat.printAllStatus(); + globalStat.reset(); + } + + delete client; +#endif +} + +int main(int argc, char** argv) { + paddle::initMain(argc, argv); + 
+  testing::InitGoogleTest(&argc, argv);
+  MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1);
+  server.start();
+  usleep(10000);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/pserver/test/test_ProtoServer.sh b/paddle/legacy/pserver/test/test_ProtoServer.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1439350847308cc5590329b0fe2a6d2c77d04409
--- /dev/null
+++ b/paddle/legacy/pserver/test/test_ProtoServer.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+for ((port=12340;port<=12360;port++))
+do
+  port_used_num=`netstat -a |grep $port|wc -l`
+  if [ $port_used_num -eq 0 ]
+  then
+    echo $port;
+    legacy/pserver/test/test_ProtoServer --port=$port
+    if [ $? -eq 0 ]
+    then
+      exit 0
+    else
+      echo "test_ProtoServer run wrong"
+      exit 1
+    fi
+  fi
+done
+echo "test_ProtoServer port not found"
+exit 1
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
deleted file mode 100644
index 3c897b5f3e09cd53ddd5b767333ce4759250da71..0000000000000000000000000000000000000000
--- a/paddle/math/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-# common package contains:
-#   * the utilities:
-#     * Thread Libs
-#     * Memory Manage libs
-#     * CommandLine Parser
-#     * Logging
-#     * Timer/Stats
-#   * the math libraries:
-#     * Matrix/Vector
-#   * the parameter optimizers.
-#   * the parameter updater functions.
-#
-# TODO(yuyang18): separate libs.
-#
-file(GLOB MATH_HEADERS . *.h)
-file(GLOB MATH_SOURCES . *.cpp)
-
-if(NOT WITH_MKLDNN)
-  set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
-  set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
-  list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
-  list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
-  message(STATUS "Skip compiling with MKLDNNMatrix")
-else()
-  message(STATUS "Compile with MKLDNNMatrix")
-endif()
-
-if(MOBILE_INFERENCE)
-  # Remove sparse
-  list(REMOVE_ITEM MATH_HEADERS
-       ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
-       ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
-       ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
-  list(REMOVE_ITEM MATH_SOURCES
-       ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
-       ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
-       ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
-endif()
-set(MATH_SOURCES
-    "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
-    "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
-    ${MATH_SOURCES})
-if(NOT WITH_GPU)
-  # then compile BaseMatrix.cu as c++ file
-  compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu")
-  compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu")
-  add_library(paddle_math STATIC
-      ${MATH_SOURCES})
-else()
-  cuda_add_library(paddle_math ${MATH_SOURCES})
-endif()
-
-
-add_dependencies(paddle_math paddle_proto ${external_project_dependencies})  # depends
-if(WITH_TESTING)
-  add_subdirectory(tests)
-endif()
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
deleted file mode 100644
index 023450ffb794086399d7131ba5faa4dbefeaaf7d..0000000000000000000000000000000000000000
--- a/paddle/math/CpuSparseMatrix.cpp
+++ /dev/null
@@ -1,787 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. 
*/ - -#include "CpuSparseMatrix.h" -#include "SparseMatrix.h" -#include "float.h" -#include "hl_gpu.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -const size_t CpuSparseMatrix::DEFAULT_AVG_WIDTH; - -CpuSparseMatrix::CpuSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans) - : Matrix(NULL, height, width, trans, false) { - resize(height, width, nnz, valueType, format); -} - -CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans) - : Matrix(dataHandle, height, width, trans, false) { - resize(height, width, nnz, valueType, format); -} - -CpuSparseMatrix::CpuSparseMatrix(real* data, - int* rows, - int* cols, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans) - : Matrix(NULL, height, width, trans, false) { - cols_ = cols; - rows_ = rows; - value_ = data; - height_ = height; - width_ = width; - elementCnt_ = nnz; - valueType_ = valueType; - format_ = format; -} - -void CpuSparseMatrix::resize(size_t newHeight, - size_t newWidth, - size_t newNnz, - SparseValueType valueType, - SparseFormat format) { - CHECK_LE(newNnz, newHeight * newWidth); - size_t newSize = 0; - if (format == SPARSE_CSR) { - newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int); - } else { - newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int); - } - - if (NO_VALUE != valueType) { - newSize += newNnz * sizeof(real); - } - - if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) { - memoryHandle_ = std::make_shared(newSize); - } - - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newNnz; - valueType_ = valueType; - format_ = format; - sparseResize(); -} -void CpuSparseMatrix::sparseResize() { - if (format_ == SPARSE_CSR) { - rows_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf())); - cols_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf()) + - (height_ + 1) * sizeof(int)); - if (NO_VALUE != valueType_) { - value_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf()) + - (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); - } else { - value_ = NULL; - } - } else { - cols_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf())); - rows_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf()) + - (width_ + 1) * sizeof(int)); - if (NO_VALUE != valueType_) { - value_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf()) + - (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); - } else { - value_ = NULL; - } - } -} - -void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth) { - resize(newHeight, - newWidth, - newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth), - valueType_, - format_); -} - -MatrixPtr CpuSparseMatrix::getTranspose() { - if (!memoryHandle_ && !value_) { - MatrixPtr dest(new CpuSparseMatrix( - height_, width_, elementCnt_, valueType_, format_, true)); - return dest; - } else if (memoryHandle_) { - MatrixPtr dest(new CpuSparseMatrix( - std::dynamic_pointer_cast(memoryHandle_), - height_, - width_, - elementCnt_, - valueType_, - format_, - true)); - return dest; - } else if (value_) { - MatrixPtr dest(new CpuSparseMatrix(value_, - rows_, - cols_, - height_, - width_, - elementCnt_, - valueType_, - format_, - true)); - return dest; - } else { - return NULL; - } -} - -SparseValueType 
CpuSparseMatrix::getValueType() { return valueType_; } - -void CpuSparseMatrix::mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - CHECK(!isTransposed()) << "Not supported"; - const auto a_ptr = dynamic_cast(&a); - const auto b_ptr = dynamic_cast(&b); - - if (a_ptr && b_ptr) { - CpuMatrix::mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, this, scaleAB, scaleT); - } else { - LOG(FATAL) << "not supported"; - } -} - -void CpuSparseMatrix::add3(CpuMatrix* b) { - CHECK(getFormat() != SPARSE_CSC) << "Not supported"; - CHECK(height_ == b->getHeight()); - CHECK(width_ == b->getWidth()); - real* A = getValue(); - real* B = b->getData(); - int* cols = getCols(); - for (size_t i = 0; i < height_; i++) { - size_t start = getRowStartIdx(i); - size_t end = getRowStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - A[j] = B[i * width_ + cols[j]]; - } - } -} - -void CpuSparseMatrix::add3(MatrixPtr b) { - if (dynamic_cast(b.get())) { - add3(dynamic_cast(b.get())); - } else { - LOG(FATAL) << "not supported"; - } -} - -void CpuSparseMatrix::addBias(Matrix& b, real scale) { - CHECK_EQ(b.getHeight(), (size_t)1); - CHECK_EQ(width_, b.getWidth()); - real* A = getValue(); - real* B = b.getData(); - int* cols = getCols(); - size_t nnz = getElementCnt(); - for (size_t i = 0; i < nnz; i++) { - A[i] += scale * B[cols[i]]; - } -} - -template -void printBuf(std::ostream& os, T* a, size_t len, const char* name) { - os << "\n: " << name << " ["; - for (size_t i = 0; i < len; i++) { - os << a[i] << " "; - } - os << "]\n"; -} - -void CpuSparseMatrix::print(std::ostream& os) const { - size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1; - size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_; - printBuf(os, rows_, rowSize, "row"); - printBuf(os, cols_, colSize, "col"); - if (valueType_ == FLOAT_VALUE) { - printBuf(os, value_, elementCnt_, "value"); - } - return; -} - -void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const { - CHECK_LT(idx, height_); - if (format_ == SPARSE_CSC) { - LOG(FATAL) << "SPARSE_CSC not supported"; - return; - } - - const int* col = getRowCols(idx); - size_t num = getColNum(idx); - if (num > 0) { - if (valueType_ == FLOAT_VALUE) { - const real* data = getRowValues(idx); - os << col[0] << ":" << data[0]; - for (size_t i = 1; i < num; ++i) { - os << " " << col[i] << ":" << data[i]; - } - } else { - os << col[0]; - for (size_t i = 1; i < num; ++i) { - os << " " << col[i]; - } - } - } - os << ";"; -} - -void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { - CHECK(getFormat() != SPARSE_CSC) << "Not supported"; - CHECK_EQ(height_, b.getHeight()); - CHECK_EQ(width_, b.getWidth()); - real* A = getValue(); - real* B = b.getValue(); - if (b.getValueType() == FLOAT_VALUE) { - for (size_t i = 0; i < height_; i++) { - size_t start = getRowStartIdx(i); - size_t end = getRowStartIdx(i + 1); - CHECK_EQ(start, b.getRowStartIdx(i)); - CHECK_EQ(end, b.getRowStartIdx(i + 1)); - for (size_t j = start; j < end; j++) { - A[j] = B[j] * c.getElement(i, cCol); - } - } - } else if (b.getValueType() == NO_VALUE) { - for (size_t i = 0; i < height_; i++) { - size_t start = getRowStartIdx(i); - size_t end = getRowStartIdx(i + 1); - CHECK_EQ(start, b.getRowStartIdx(i)); - CHECK_EQ(end, b.getRowStartIdx(i + 1)); - for (size_t j = start; j < end; j++) { - A[j] = c.getElement(i, cCol); - } - } - } -} - -void CpuSparseMatrix::randomizeUniform() { - CHECK_LE(elementCnt_, height_ * width_); - if (valueType_ == FLOAT_VALUE) { - real* data = 
getValue(); - for (size_t i = 0; i < elementCnt_; ++i) { - *data++ = rand() / static_cast(RAND_MAX); // NOLINT - } - } - if (format_ == SPARSE_CSR) { - sparseRand(rows_, cols_, elementCnt_, height_ + 1, width_, false); - } else { - sparseRand(cols_, rows_, elementCnt_, width_ + 1, height_, false); - } -} - -void CpuSparseMatrix::copyFrom(std::vector& rows, - std::vector& cols, - std::vector& values) { - size_t size = format_ == SPARSE_CSR ? cols.size() : rows.size(); - resize(height_, width_, size, valueType_, format_); - if (valueType_ == FLOAT_VALUE) { - memcpy(&value_[0], &values[0], sizeof(real) * values.size()); - } - memcpy(&cols_[0], &cols[0], sizeof(int) * cols.size()); - memcpy(&rows_[0], &rows[0], sizeof(int) * rows.size()); -} - -// Copy from a CpuMatrix, only supported in sparse_float_value_t -// SparseMatrix. -void CpuSparseMatrix::copyFrom(const CpuMatrix& src) { - CHECK_EQ(getHeight(), src.getHeight()); - CHECK_EQ(getWidth(), src.getWidth()); - CHECK(!src.trans_ && !trans_); - if (format_ == SPARSE_CSR) { - std::vector rows(getHeight() + 1); - std::vector cols; - std::vector values; - rows[0] = 0; - for (size_t r = 0; r < getHeight(); ++r) { - for (size_t c = 0; c < getWidth(); ++c) { - real v = src.getElement(r, c); - if (fabs(v) > FLT_EPSILON) { - cols.push_back(c); - values.push_back(v); - } - } - rows[r + 1] = values.size(); - } - copyFrom(rows, cols, values); - } else { - std::vector cols(getWidth() + 1); - std::vector rows; - std::vector values; - cols[0] = 0; - for (size_t r = 0; r < getWidth(); ++r) { - for (size_t c = 0; c < getHeight(); ++c) { - real v = src.getElement(c, r); - if (fabs(v) > FLT_EPSILON) { - rows.push_back(c); - values.push_back(v); - } - } - cols[r + 1] = values.size(); - } - copyFrom(rows, cols, values); - } -} - -MatrixPtr CpuSparseMatrix::clone(size_t height, size_t width, bool useGpu) { - if (height == 0 && width == 0) { - height = height_; - width = width_; - } - CHECK(width && height); - if (!useGpu) { - return std::make_shared( - height, width, 0, valueType_, format_); - } else { - return std::make_shared( - height, width, elementCnt_, valueType_, format_); - } -} - -MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) { - CHECK_LE(startRow + numRows, height_); - CHECK_EQ(format_, SPARSE_CSR); - if (valueType_ == NO_VALUE) { - return std::make_shared( - nullptr, - rows_ + startRow, - cols_, - numRows, - width_, - rows_[startRow + numRows] - rows_[startRow], - valueType_, - format_, - trans_); - } else { - return std::make_shared( - value_, - rows_ + startRow, - cols_, - numRows, - width_, - rows_[startRow + numRows] - rows_[startRow], - valueType_, - format_, - trans_); - } -} - -/* mem MUST be alloced outside (memAlloc=false) */ -void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { - CHECK(!memAlloc); - CpuSparseMatrix* mat = dynamic_cast(matTrans.get()); - if (format_ == SPARSE_CSR) { - /*statistic element number in each col*/ - int* colCounters = mat->getRows() + 1; - memset(colCounters, 0, sizeof(int) * width_); - for (size_t i = 0; i < elementCnt_; ++i) { - int col = cols_[i]; - colCounters[col]++; - } - /*fill mat rows */ - mat->getRows()[0] = 0; - for (size_t i = 1; i < width_ + 1; i++) { - mat->getRows()[i] = mat->getRows()[i - 1] + mat->getRows()[i]; - } - /*fill mat values and cols*/ - std::vector colNumVec(width_, 0); - if (valueType_ == FLOAT_VALUE) { - for (size_t i = 0; i < height_; i++) { - for (int j = rows_[i]; j < rows_[i + 1]; j++) { - int colIdx = cols_[j]; - int index = 
mat->getRows()[colIdx] + colNumVec[colIdx]; - mat->getCols()[index] = i; - mat->getValue()[index] = value_[j]; - colNumVec[colIdx]++; - } - } - } else { - for (size_t i = 0; i < height_; i++) { - for (int j = rows_[i]; j < rows_[i + 1]; j++) { - int colIdx = cols_[j]; - int index = mat->getRows()[colIdx] + colNumVec[colIdx]; - mat->getCols()[index] = i; - colNumVec[colIdx]++; - } - } - } - } else { - /*statistic element number in each row*/ - int* rowCounters = mat->getCols() + 1; - memset(rowCounters, 0, sizeof(int) * height_); - for (size_t i = 0; i < elementCnt_; ++i) { - int row = rows_[i]; - rowCounters[row]++; - } - - /*fill mat cols */ - mat->getCols()[0] = 0; - for (size_t i = 1; i < height_ + 1; i++) { - mat->getCols()[i] = mat->getCols()[i - 1] + mat->getCols()[i]; - } - /*fill mat values and rows*/ - std::vector rowNumVec(height_, 0); - if (valueType_ == FLOAT_VALUE) { - for (size_t i = 0; i < width_; i++) { - for (int j = cols_[i]; j < cols_[i + 1]; j++) { - int rowIdx = rows_[j]; - int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx]; - mat->getRows()[index] = i; - mat->getValue()[index] = value_[j]; - rowNumVec[rowIdx]++; - } - } - } else { - for (size_t i = 0; i < width_; i++) { - for (int j = cols_[i]; j < cols_[i + 1]; j++) { - int rowIdx = rows_[j]; - int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx]; - mat->getRows()[index] = i; - rowNumVec[rowIdx]++; - } - } - } - } -} - -void CpuSparseMatrix::setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - if (format_ == SPARSE_CSR) { - CHECK_LT(row, height_); - CHECK(NULL != cols); - if (0 == row) { - rows_[row] = 0; - } - rows_[row + 1] = rows_[row] + colNum; - for (size_t i = 0; i < colNum; ++i) { - cols_[rows_[row] + i] = cols[i]; - } - if (valueType_ == NO_VALUE) { - CHECK(!values); - } else { - for (size_t i = 0; i < colNum; ++i) { - value_[rows_[row] + i] = values[i]; - } - } - } else { - LOG(FATAL) << "not supported"; - } -} - -void CpuSparseMatrix::fillRowIndices(IVectorPtr& outVec) const { - if (format_ == SPARSE_CSR) { - auto nnz = getElementCnt(); - IVector::resizeOrCreate(outVec, nnz, false); - auto out = outVec->getData(); - int* rows = getRows(); - for (size_t i = 0; i < height_; i++) { - for (int j = rows[i]; j < rows[i + 1]; j++) { - out[j] = i; - } - } - } else { - LOG(FATAL) << "SPARSE_CSC not supported"; - } -} - -ThreadLocal> CpuSparseMatrix::cpuLocalMats_; - -CpuSparseMatrixPtr CpuSparseMatrix::getTmpSparseMatrix(size_t height, - size_t width) { - std::vector* localMats = cpuLocalMats_.get(); - auto it = localMats->begin(); - while (it != localMats->end()) { - if (it->unique()) { - (*it)->resize(height, width, elementCnt_, valueType_, format_); - return *it; - } - } - localMats->emplace_back(std::make_shared( - height, width, elementCnt_, valueType_, format_, false)); - return localMats->back(); -} - -void CpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { - if (dynamic_cast(&src)) { - auto tmpSrc = dynamic_cast(&src); - copyFrom(*tmpSrc, stream); - } else if (dynamic_cast(&src)) { - auto tmpSrc = dynamic_cast(&src); - copyFrom(*tmpSrc); - } else if (dynamic_cast(&src)) { - auto tmpSrc = dynamic_cast(&src); - copyFrom(*tmpSrc); - } else { - LOG(FATAL) << "not implemented"; - } -} - -void CpuSparseMatrix::copyFrom(const Matrix& src) { - if (dynamic_cast(&src)) { - auto tmpSrc = dynamic_cast(&src); - copyFrom(*tmpSrc); - } else if (dynamic_cast(&src)) { - auto tmpSrc = dynamic_cast(&src); - copyFrom(*tmpSrc); - } else { - LOG(FATAL) << "not 
implemented"; - } -} - -void CpuSparseMatrix::copyFrom(const GpuSparseMatrix& src, hl_stream_t stream) { - CHECK_EQ(height_, src.getHeight()); - CHECK_EQ(width_, src.getWidth()); - CHECK_EQ(size_t(elementCnt_), src.getElementCnt()); - size_t valSize = valueType_ == NO_VALUE ? 0 : elementCnt_; - if (format_ == SPARSE_CSC) - hl_memcpy_from_csc_matrix(value_, - valSize, - rows_, - elementCnt_, - cols_, - width_ + 1, - src.sMatrix_.get(), - stream); - else - hl_memcpy_from_csr_matrix(value_, - valSize, - rows_, - height_ + 1, - cols_, - elementCnt_, - src.sMatrix_.get(), - stream); -} - -void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) { - CHECK_EQ(height_, src.getHeight()); - CHECK_EQ(width_, src.getWidth()); - CHECK_EQ(format_, src.getFormat()); - int start = format_ == SPARSE_CSR ? src.getRows()[0] : src.getCols()[0]; - if (format_ == SPARSE_CSR) { - size_t totalColNum = 0; - for (size_t i = 0; i < height_; ++i) { - totalColNum += src.getColNum(i); - } - resize(height_, width_, totalColNum, valueType_, format_); - rows_[0] = 0; - for (size_t i = 0; i < height_; ++i) { - rows_[i + 1] = rows_[i] + src.getColNum(i); - } - memcpy(cols_, src.getCols() + start, totalColNum * sizeof(int)); - } else { - size_t totalColNum = 0; - for (size_t i = 0; i < width_; ++i) { - totalColNum += src.getRowNum(i); - } - resize(height_, width_, totalColNum, valueType_, format_); - cols_[0] = 0; - for (size_t i = 0; i < width_; ++i) { - cols_[i + 1] = cols_[i] + src.getRowNum(i); - } - memcpy(rows_, src.getRows() + start, totalColNum * sizeof(int)); - } - - // if have different value type, only copy rows and cols - if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) { - memcpy(value_, src.getValue() + start, elementCnt_ * sizeof(real)); - } -} - -void CpuSparseMatrix::copyRow(int offsets, - size_t colNum, - const sparse_non_value_t* row) { - for (size_t j = 0; j < colNum; j++) { - cols_[offsets + j] = row[j].col; - } -} - -void CpuSparseMatrix::copyRow(int offsets, - size_t colNum, - const sparse_float_value_t* row) { - for (size_t j = 0; j < colNum; j++) { - cols_[offsets + j] = row[j].col; - value_[offsets + j] = row[j].value; - } -} - -template -void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data) { - size_t totalColNum = 0; - for (size_t i = 0; i < height_; ++i) { - int64_t id = ids[i]; - totalColNum += indices[id + 1] - indices[id]; - } - valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE; - - resize(height_, width_, totalColNum, valueType_, format_); - - rows_[0] = 0; - for (size_t i = 0; i < height_; ++i) { - int64_t id = ids[i]; - T* row = data + indices[id]; - size_t colNum = indices[id + 1] - indices[id]; - rows_[i + 1] = rows_[i] + colNum; - copyRow(rows_[i], colNum, row); - } -} - -template -void CpuSparseMatrix::copyFrom(int64_t* indices, T* data) { - CHECK(format_ == SPARSE_CSR); - size_t totalColNum = indices[height_] - indices[0]; - valueType_ = typeid(T) == typeid(sparse_non_value_t) ? 
NO_VALUE : FLOAT_VALUE; - resize(height_, width_, totalColNum, valueType_, format_); - - rows_[0] = 0; - for (size_t i = 0; i < height_; ++i) { - T* row = data + indices[i]; - size_t colNum = indices[i + 1] - indices[i]; - rows_[i + 1] = rows_[i] + colNum; - copyRow(rows_[i], colNum, row); - } -} - -void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) { - CHECK_EQ(height_, src.getHeight()); - CHECK_LE(width_, src.getWidth()); - CHECK_EQ(format_, src.getFormat()); - CHECK_EQ(valueType_, src.getValueType()); - if (format_ == SPARSE_CSR) { - int* srcCols = src.getCols(); - size_t numLessWidth = - std::count_if(srcCols, srcCols + src.getElementCnt(), [this](size_t n) { - return n < this->width_; - }); - resize(height_, width_, numLessWidth, valueType_, format_); - rows_[0] = 0; - size_t index = 0; - for (size_t r = 0; r < height_; ++r) { - for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) { - if (srcCols[i] < static_cast(width_)) { - cols_[index] = srcCols[i]; - if (valueType_ == FLOAT_VALUE) { - value_[index] = src.getValue()[i]; - } - ++index; - } - } - rows_[r + 1] = index; - } - CHECK_EQ(index, numLessWidth); - } else { - size_t numLessWidth = src.getCols()[width_] - src.getCols()[0]; - resize(height_, width_, numLessWidth, valueType_, format_); - cols_[0] = 0; - size_t index = 0; - // note: c < width_, not src.getWidth(); - for (size_t c = 0; c < width_; ++c) { - for (int i = src.getCols()[c]; i < src.getCols()[c + 1]; ++i) { - rows_[index] = src.getRows()[i]; - if (valueType_ == FLOAT_VALUE) { - value_[index] = src.getValue()[i]; - } - ++index; - } - cols_[c + 1] = index; - } - CHECK_EQ(index, numLessWidth); - } -} - -void CpuSparseMatrix::zeroMem() { - CHECK(valueType_ == FLOAT_VALUE); - memset(value_, 0, elementCnt_ * sizeof(real)); -} - -template void CpuSparseMatrix::copyFrom(int64_t* ids, - int64_t* indices, - sparse_non_value_t* data); - -template void CpuSparseMatrix::copyFrom(int64_t* ids, - int64_t* indices, - sparse_float_value_t* data); - -template void CpuSparseMatrix::copyFrom(int64_t* indices, - sparse_non_value_t* data); - -template void CpuSparseMatrix::copyFrom(int64_t* indices, - sparse_float_value_t* data); - -void CpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { - size_t numSamples = getHeight(); - size_t beam = maxVal.getWidth(); - CHECK_EQ(maxIds.getSize(), numSamples * beam); - CHECK_EQ(maxVal.getHeight(), numSamples); - maxVal.zeroMem(); - int* outids = maxIds.getData(); - real* outvalues = maxVal.getData(); - - typedef std::pair valuepair; - std::vector vec; - for (size_t i = 0; i < numSamples; i++) { - vec.clear(); - - auto num = getColNum(i); - auto ids = getRowCols(i); - auto values = getRowValues(i); - for (size_t j = 0; j < num; j++) { - vec.push_back(std::make_pair(values[j], ids[j])); - } - - size_t outsize = std::min(num, beam); - std::partial_sort(vec.begin(), - vec.begin() + outsize, - vec.end(), - [](const valuepair& a, const valuepair& b) { - return a.first > b.first; - }); - for (size_t j = 0; j < outsize; j++) { - outids[i * beam + j] = vec[j].second; - outvalues[i * beam + j] = vec[j].first; - } - if (outsize < beam) { - // if the number of values to sort are less than the output size, - // use -1 to indicate the end of valid sorted values. 
- outids[i * beam + outsize] = -1; - } - } -} - -} // namespace paddle diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h deleted file mode 100644 index d4a78f3e54b73add3c00e17f13d91359839d3d14..0000000000000000000000000000000000000000 --- a/paddle/math/MKLDNNMatrix.h +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "Matrix.h" -#include "mkldnn.hpp" -#include "paddle/parameter/Parameter.h" - -namespace paddle { - -class MKLDNNMatrix; -typedef std::shared_ptr MKLDNNMatrixPtr; - -#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...) \ - CHECK(MAT) << " can not be empty."; \ - CHECK(MAT->getPrimitiveDesc() == PD) \ - << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n " \ - << "" __VA_ARGS__; - -/** - * @brief MKLDNN Matrix. - * - */ -class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory { - public: - MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd) - : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false), - mkldnn::memory(pd, m->getData()), - m_(m) {} - - ~MKLDNNMatrix() {} - - /** - * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc - */ - static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd, - MatrixPtr m = nullptr); - - /** - * Create MKLDNNMatrix from a MatrixPtr and memory details info - */ - static MKLDNNMatrixPtr create( - mkldnn::memory::dims dims, - mkldnn::memory::format fmt, - mkldnn::engine& eg, - MatrixPtr m = nullptr, - mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); - - /** - * Create primitive descriptor. - * default with f32 dtype - */ - static mkldnn::memory::primitive_desc createPrimitiveDesc( - const mkldnn::memory::dims dims, - const mkldnn::memory::format& fmt, - const mkldnn::engine& eg, - const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { - return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg); - } - - /** - * Create Memory descriptor. - * default with any format and f32 dtype - */ - static mkldnn::memory::desc createMemoryDesc( - const mkldnn::memory::dims dims, - const mkldnn::memory::format& fmt = mkldnn::memory::format::any, - const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { - return mkldnn::memory::desc(dims, dtype, fmt); - } - - /** - * Create reorder primitive. - * Create a mkldnn::reorder handle for converting src MKLDNNMatrix to dst. - * checkData: whether to check the data handle of src and dst. - * if true, it will check the data and do not allow them equal; - * otherwise, it will not check them, then the reorder created - * may have inplace buffer. - * Do not set false, if you can not guarantee the inplace logical - * would work with your reorder. 
- */ - static std::shared_ptr createReorder( - const MKLDNNMatrixPtr& src, - const MKLDNNMatrixPtr& dst, - bool checkData = true); - - void copyFrom(const Matrix& src) { - // TODO(TJ): reorder data if this format is not nchw or x - m_->copyFrom(src); - } - - void copyTo(Matrix& dst) { - // TODO(TJ): reorder data if this format is not nchw or x - dst.copyFrom(*m_); - } - - public: - /** - * Reorder this MKLDNNMatrix from other format. - * Support inplace reorder. - * @note: this function would only reorder the data layout. - * will NOT change this original dim or format info - */ - void reorderDataFrom(const MKLDNNMatrixPtr& m, - memory::format srcFmt, - memory::dims targetDim); - - /** - * Reorder this MKLDNNMatrix to other format. - * Support inplace reorder. - * @note: this function would only reorder the data layout. - * will NOT change the dst dim or format info - */ - void reorderDataTo(const MKLDNNMatrixPtr& m, - memory::format dstFmt, - memory::dims targetDim); - - /** - * Dimensionality reduction. - * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1 - */ - void downSpatial(); - - /** - * set the memory data handle. - * Caution: This will not check the buffer size of the data, - * it should be coverd by user. - */ - void setData(real* data) { - set_data_handle(data); - CpuMatrix::setData(data); - m_.reset(); - } - - /** - * override the CpuMatrix::resize - */ - void resize(size_t newHeight, size_t newWidth) override { - m_->resize(newHeight, newWidth); - if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) { - return; - } - CpuMatrix::setData(data_); - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newHeight * newWidth; - stride_ = width_; - auto pd = mkldnn::memory::primitive_desc( - mkldnn::memory::desc({(int)newHeight, (int)newWidth}, - getDtype(), - mkldnn::memory::format::nc), - getEngine()); - resetMKLDNNMemory(pd, data_); - } - - /** - * override Matrix::getData - * check data before return - */ - real* getData() override { - CHECK_EQ((void*)data_, get_data_handle()); - return data_; - } - - const real* getData() const override { - CHECK_EQ((void*)data_, get_data_handle()); - return data_; - } - - /** - * Get primitive descriptor. - */ - mkldnn::memory::primitive_desc getPrimitiveDesc() { - return this->get_primitive_desc(); - } - - /** - * Get memory descriptor. - */ - mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); } - - /** - * Get dimensions. - */ - mkldnn::memory::dims getDims() { - mkldnn::memory::desc md = getMemoryDesc(); - const int* src = md.data.dims; - int ndims = md.data.ndims; - mkldnn::memory::dims dst; - dst.resize(ndims); - for (int i = 0; i < ndims; ++i) { - dst[i] = src[i]; - } - return dst; - } - - /** - * Get format. - */ - mkldnn::memory::format getFormat() { - return (mkldnn::memory::format)(getMemoryDesc().data.format); - } - - /** - * Get memory data type. - */ - mkldnn::memory::data_type getDtype() { - return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type); - } - - /** - * Get engine. - */ - mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); } - - protected: - /** - * Do reorder once. - * Can support inplace. 
- */ - void reorderOnce(void* srcData, - void* dstData, - memory::format srcFmt, - memory::format dstFmt, - memory::dims dm); - /** - * reset this MKLDNN Memory from primitve desc - */ - void resetMKLDNNMemory(memory::primitive_desc pd, real* data) { - mkldnn_primitive_t result; - mkldnn::error::wrap_c_api( - mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr), - "could not create a memory primitive"); - reset(result); - set_data_handle(data); - } - - private: - // save the CpuMatrixPtr in case the buffer released outside - CpuMatrixPtr m_; -}; - -} // namespace paddle diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp deleted file mode 100644 index f48119aa511578b21602a225277f01b4c6a9e9a8..0000000000000000000000000000000000000000 --- a/paddle/math/MathFunctions.cpp +++ /dev/null @@ -1,348 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/math/MathFunctions.h" -#include "hl_matrix_apply.cuh" -#include "hl_matrix_ops.cuh" -#include "paddle/utils/DynamicLoader.h" - -namespace dynload { - -std::once_flag lapack_dso_flag; -void* lapack_dso_handle = nullptr; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load lapack routine - * via operator overloading. - * - * note: default dynamic linked libs - */ - -// The argument for stringizing operator is not macro-expanded first. -// We have to use two levels of macro to do the expansion. -// See https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html -#define STR(x) #x - -// clang-format off -#ifndef LAPACK_FOUND -#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using lapack_func = decltype(__name(args...)) (*)(Args...); \ - std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \ - void* p_##__name = dlsym(lapack_dso_handle, STR(__name)); \ - CHECK(p_##__name) << "Cannot find symbol " << STR(__name) \ - << " in liblapack.so"; \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; // struct DynLoad__##__name -#else -#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... 
args) -> decltype(__name(args...)) { \ - return __name(args...); \ - } \ - } __name; // struct DynLoad__##__name -#endif - -#define PADDLE_SGETRF LAPACKE_sgetrf -#define PADDLE_DGETRF LAPACKE_dgetrf -#define PADDLE_SGETRI LAPACKE_sgetri -#define PADDLE_DGETRI LAPACKE_dgetri - -#define LAPACK_ROUTINE_EACH(__macro) \ - __macro(PADDLE_SGETRF) \ - __macro(PADDLE_DGETRF) \ - __macro(PADDLE_SGETRI) \ - __macro(PADDLE_DGETRI) -// clang-format on - -LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP) - -} // namespace dynload - -namespace paddle { - -#ifndef PADDLE_USE_EIGEN_FOR_BLAS -template <> -void gemm(const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const int lda, - const float* B, - const int ldb, - const float beta, - float* C, - const int ldc) { - cblas_sgemm(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -void gemm(const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, - const int M, - const int N, - const int K, - const double alpha, - const double* A, - const int lda, - const double* B, - const int ldb, - const double beta, - double* C, - const int ldc) { - cblas_dgemm(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} -#endif - -template <> -int getrf(const CBLAS_ORDER order, - const int M, - const int N, - float* A, - const int lda, - int* ipiv) { - return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv); -} - -template <> -int getrf(const CBLAS_ORDER order, - const int M, - const int N, - double* A, - const int lda, - int* ipiv) { - return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv); -} - -template <> -int getri(const CBLAS_ORDER order, - const int N, - float* A, - const int lda, - const int* ipiv) { - return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv); -} - -template <> -int getri(const CBLAS_ORDER order, - const int N, - double* A, - const int lda, - const int* ipiv) { - return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv); -} - -#ifndef PADDLE_USE_EIGEN_FOR_BLAS -template <> -void axpy(const int n, const float alpha, const float* x, float* y) { - cblas_saxpy(n, alpha, x, 1, y, 1); -} - -template <> -void axpy(const int n, const double alpha, const double* x, double* y) { - cblas_daxpy(n, alpha, x, 1, y, 1); -} - -template <> -float dotProduct(const int n, const float* x, const float* y) { - return cblas_sdot(n, x, 1, y, 1); -} - -template <> -double dotProduct(const int n, const double* x, const double* y) { - return cblas_ddot(n, x, 1, y, 1); -} -#endif - -#if defined(PADDLE_WITH_MKLML) - -template <> -void vExp(const int n, const float* a, float* r) { - vsExp(n, a, r); -} - -template <> -void vExp(const int n, const double* a, double* r) { - vdExp(n, a, r); -} - -template <> -void vPow(const int n, const float* a, const float b, float* r) { - vsPowx(n, a, b, r); -} - -template <> -void vPow(const int n, const double* a, const double b, double* r) { - vdPowx(n, a, b, r); -} - -template <> -void vLog(const int n, const float* a, float* r) { - vsLn(n, a, r); -} - -template <> -void vLog(const int n, const double* a, double* r) { - vdLn(n, a, r); -} - -template <> -void vAdd(const int n, const float* a, const float* b, float* r) { - vsAdd(n, a, b, r); -} - -template <> -void vAdd(const int n, const double* a, const double* b, double* r) { - vdAdd(n, a, b, r); -} - -template <> -void vTanh(const int n, const float* a, float* r) { - vsTanh(n, a, 
r); -} - -template <> -void vTanh(const int n, const double* a, double* r) { - vdTanh(n, a, r); -} - -template <> -void vInvSqrt(const int n, const float* a, float* r) { - vsInvSqrt(n, a, r); -} - -template <> -void vInvSqrt(const int n, const double* a, double* r) { - vdInvSqrt(n, a, r); -} - -template <> -void vLog1p(const int n, const float* a, float* r) { - vsLog1p(n, a, r); -} - -template <> -void vLog1p(const int n, const double* a, double* r) { - vdLog1p(n, a, r); -} -#else - -DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a)); -template -void vExp(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vExp(), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a)); -template -void vLog(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vLog(), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p)); -template -void vPow(const int n, const T* a, const T b, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vPow(b), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b); -template -void vAdd(const int n, const T* a, const T* b, T* r) { - hl_cpu_apply_ternary_op, 0, 0>(ternary::vAdd(), - const_cast(a), - const_cast(b), - r, - 1, - n, - n, - n, - n); -} - -DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a)); -template -void vInvSqrt(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vInvSqrt(), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a)); -template -void vLog1p(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vLog1p(), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); -template -void vTanh(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vTanh(), const_cast(a), r, 1, n, n, n); -} - -template void vExp(const int n, const float* a, float* r); -template void vExp(const int n, const double* a, double* r); -template void vLog(const int n, const float* a, float* r); -template void vLog(const int n, const double* a, double* r); -template void vPow(const int n, const float* a, const float b, float* r); -template void vPow(const int n, const double* a, const double b, double* r); -template void vAdd(const int n, const float* a, const float* b, float* r); -template void vAdd(const int n, const double* a, const double* b, double* r); -template void vInvSqrt(const int n, const double* a, double* r); -template void vInvSqrt(const int n, const float* a, float* r); -template void vLog1p(const int n, const float* a, float* r); -template void vLog1p(const int n, const double* a, double* r); -template void vTanh(const int n, const float* a, float* r); -template void vTanh(const int n, const double* a, double* r); -#endif -} // namespace paddle diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp deleted file mode 100644 index bcd6dfe1fda6b1243007b0c26a6e0087eedcc10c..0000000000000000000000000000000000000000 --- a/paddle/math/Matrix.cpp +++ /dev/null @@ -1,4787 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Matrix.h" -#include "MathFunctions.h" -#include "SparseMatrix.h" -#include "SparseRowMatrix.h" - -#include -#include -#include - -#include -#include "hl_cnn.h" -#include "hl_gpu.h" -#include "hl_table_apply.h" -#include "hl_top_k.h" -#include "paddle/utils/Logging.h" - -#include "NEONFunctions.h" -#include "paddle/function/GemmFunctor.h" -#include "paddle/utils/ThreadLocal.h" - -#include "SIMDFunctions.h" - -namespace paddle { - -inline real _pow(real a, real beta) { return std::pow(a, beta); } - -inline real _square(real a) { return a * a; } - -inline real _safelog(real a) { return a > 0.0f ? std::log(a) : -40.0f; } - -Matrix::Matrix(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans, - bool use_gpu) - : BaseMatrix( - height, - width, - memHandle ? (reinterpret_cast(memHandle->getBuf())) : nullptr, - trans, - use_gpu) { - elementCnt_ = width * height; - memoryHandle_ = memHandle; -} - -Matrix::Matrix( - real* data, size_t height, size_t width, bool trans, bool use_gpu) - : BaseMatrix(height, width, data, trans, use_gpu) { - elementCnt_ = width * height; -} - -Matrix::Matrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans, - bool use_gpu) - : BaseMatrix(height, width, stride, data, trans, use_gpu) { - elementCnt_ = width * height; -} - -MatrixPtr Matrix::createSparseMatrix(real* data, - int* row, - int* col, - size_t height, - size_t width, - size_t nnz, /* used to allocate space */ - SparseValueType valueType, /*value type*/ - SparseFormat format, - bool trans, - bool useGpu) { - if (useGpu) { - return std::make_shared( - data, row, col, height, width, nnz, valueType, format, trans); - } else { - return std::make_shared( - data, row, col, height, width, nnz, valueType, format, trans); - } -} - -MatrixPtr Matrix::createSparseMatrix(size_t height, - size_t width, - size_t nnz, /* used to allocate space */ - SparseValueType valueType, /*value type*/ - SparseFormat format, - bool trans, - bool useGpu) { - if (useGpu) { - return std::make_shared( - height, width, nnz, valueType, format, trans); - } else { - return std::make_shared( - height, width, nnz, valueType, format, trans); - } -} - -MatrixPtr Matrix::create(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans) { - if (auto gpuHandle = std::dynamic_pointer_cast(memHandle)) { - return std::make_shared(gpuHandle, height, width, trans); - } else if (auto cpuHandle = - std::dynamic_pointer_cast(memHandle)) { - return std::make_shared(cpuHandle, height, width, trans); - } else { - LOG(FATAL) << "Wrong"; - return nullptr; - } -} - -MatrixPtr Matrix::create(size_t height, size_t width, bool trans, bool useGpu) { - if (useGpu) { - return std::make_shared(height, width, trans); - } else { - return std::make_shared(height, width, trans); - } -} - -MatrixPtr Matrix::create( - real* data, size_t height, size_t width, bool trans, bool useGpu) { - if (useGpu) { - return std::make_shared(data, height, width, trans); - } else { - return std::make_shared(data, height, width, trans); - } -} - -MatrixPtr Matrix::create(real* data, - size_t height, - size_t width, - size_t stride, 
- bool trans, - bool useGpu) { - if (useGpu) { - return std::make_shared(data, height, width, stride, trans); - } else { - return std::make_shared(data, height, width, stride, trans); - } -} - -MatrixPtr Matrix::createSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - bool trans, - bool useGpu) { - if (useGpu) { - return std::make_shared( - height, width, nnz, valueType, SPARSE_CSR, trans); - } else { - return std::make_shared( - height, width, nnz, valueType, SPARSE_CSR, trans); - } -} - -void Matrix::resizeOrCreate( - MatrixPtr& matrix, size_t height, size_t width, bool trans, bool useGpu) { - if (!matrix) { - matrix = Matrix::create(height, width, trans, useGpu); - } else { - CHECK_EQ(matrix->useGpu(), useGpu); - matrix->resize(height, width); - } -} - -void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans, - bool useGpu) { - if (!matrix) { - matrix = Matrix::createSparseMatrix( - height, width, nnz, valueType, format, trans, useGpu); - } else { - CHECK(dynamic_cast(matrix.get()) || - dynamic_cast(matrix.get())); - CHECK_EQ(matrix->useGpu(), useGpu); - matrix->resize(height, width, nnz, valueType, format); - } -} - -void Matrix::reshape(size_t height, size_t width) { - CHECK(isContiguous()); - CHECK(height_ * width_ == height * width); - height_ = height; - width_ = width; - stride_ = width_; -} - -MatrixPtr Matrix::subMatrix(size_t startRow, - size_t endRow, - size_t startCol, - size_t endCol) { - CHECK_LE(startRow, endRow); - CHECK_LE(endRow, getHeight()); - CHECK_LE(startCol, endCol); - CHECK_LE(endCol, getWidth()); - - return Matrix::create(getData() + startRow * getStride() + startCol, - endRow - startRow, - endCol - startCol, - getStride(), - trans_, - useGpu_); -} - -void Matrix::setDiag(real value) { - CHECK(data_ != NULL); - CHECK_EQ(height_, width_); - - zeroMem(); - BaseMatrix diag(height_, 1, stride_ + 1, data_, false, useGpu_); - diag.assign(value); -} - -GpuMatrix::GpuMatrix(size_t height, size_t width, bool trans) - : Matrix(std::make_shared(height * width * sizeof(real)), - height, - width, - trans, - true) {} - -GpuMatrix::~GpuMatrix() {} - -void GpuMatrix::zeroMem() { - CHECK(data_ != NULL); - zero(); -} - -void GpuMatrix::resetOne() { - CHECK(data_ != NULL); - one(); -} - -void GpuMatrix::resize(size_t newHeight, size_t newWidth) { - size_t newSize = newHeight * newWidth; - if (NULL == memoryHandle_.get() || - newSize * sizeof(real) > memoryHandle_->getAllocSize()) { - memoryHandle_ = std::make_shared(newSize * sizeof(real)); - data_ = reinterpret_cast(memoryHandle_->getBuf()); - } - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newSize; - stride_ = width_; -} - -real GpuMatrix::getElement(size_t x, size_t y) const { - real elem = 0; - hl_memcpy_device2host(&elem, &data_[x * stride_ + y], sizeof(real)); - return elem; -} - -real GpuMatrix::getSum() { - CHECK(isContiguous()); - real sum = 0.0f; - hl_vector_sum(data_, &sum, height_ * width_); - return sum; -} - -real GpuMatrix::getMin() { - CHECK(isContiguous()); - auto vec = GpuVector(height_ * width_, data_); - return vec.getMin(); -} - -real GpuMatrix::getMax() { - CHECK(isContiguous()); - auto vec = GpuVector(height_ * width_, data_); - return vec.getMax(); -} - -void GpuMatrix::accumulateColSum(Matrix& src) { - CHECK_EQ(getWidth(), src.getWidth()); - CHECK_EQ(getHeight(), (size_t)1); - sumCols(src, 1.0, 1.0); -} - -real GpuMatrix::getAbsSum() { - 
CHECK(isContiguous()); - real sum = 0.0f; - hl_vector_abs_sum(data_, &sum, height_ * width_); - return sum; -} - -void GpuMatrix::copyFrom(const Matrix& src) { - CHECK(isContiguous()); - CHECK(src.isContiguous()); - CHECK(elementCnt_ == src.getElementCnt()); - - if (typeid(src) == typeid(CpuMatrix)) { - hl_memcpy_host2device( - data_, const_cast(src.getData()), sizeof(real) * elementCnt_); - } else if (typeid(src) == typeid(GpuMatrix)) { - hl_memcpy_device2device( - data_, const_cast(src.getData()), sizeof(real) * elementCnt_); - } else { - LOG(FATAL) << "Wrong"; - } -} - -void GpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { - CHECK(isContiguous()); - CHECK(src.isContiguous()); - CHECK(elementCnt_ == src.getElementCnt()); - hl_memcpy_async(this->getData(), - const_cast(src.getData()), - sizeof(real) * elementCnt_, - stream); -} - -void GpuMatrix::copyFrom(const real* hostSrc, size_t size) { - CHECK(isContiguous()); - CHECK(size <= elementCnt_); - hl_memcpy_host2device(data_, const_cast(hostSrc), sizeof(real) * size); -} - -void GpuMatrix::copyFrom(const real* hostSrc, const int64_t* seq) { - LOG(FATAL) << "not implemented"; -} - -void GpuMatrix::copyFrom(const IVector& src) { - CHECK(isContiguous()); - CpuMatrix matrix(src.getSize(), 1, false); - matrix.copyFrom(src); - copyFrom(matrix); -} - -void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { - size_t height = getHeight(); - size_t width = getWidth(); - CHECK_EQ(b.getWidth(), width); - real* dst = getData(); - real* src = b.getData(); - const int* index = rowIndex.getData(); - hl_sequence2batch_copy(dst, src, index, width, height, true); -} - -MatrixPtr GpuMatrix::clone(size_t height, size_t width, bool useGpu) { - CHECK(isContiguous()); - - if (height == 0 && width == 0) { - height = height_; - width = width_; - } - - CHECK(width && height); - - if (useGpu) { - return std::make_shared(height, width); - } else { - return std::make_shared(height, width); - } -} - -MatrixPtr GpuMatrix::getTranspose() { - if (memoryHandle_.get() != NULL) { - MatrixPtr copy_T( - new GpuMatrix(std::dynamic_pointer_cast(memoryHandle_), - height_, - width_, - true)); - return copy_T; - } else { - MatrixPtr copy_T(new GpuMatrix(data_, height_, width_, true)); - return copy_T; - } -} - -void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { - if (memAlloc) { - matTrans = std::make_shared(width_, height_); - } else { - CHECK(matTrans != NULL); - CHECK_EQ(matTrans->getHeight(), width_); - CHECK_EQ(matTrans->getWidth(), height_); - } - real* dataTrans = matTrans->getData(); - real* data = getData(); - int lda = getStride(); - int ldc = matTrans->getStride(); - - hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc); -} - -void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { - if (memAlloc) { - matRot = std::make_shared(width_, height_); - } else { - CHECK(matRot != NULL); - CHECK_EQ(matRot->getHeight(), width_); - CHECK_EQ(matRot->getWidth(), height_); - } - - real* dataRot = matRot->getData(); - real* data = getData(); - hl_matrix_rotate(data, dataRot, height_, width_, clockWise); -} - -MatrixPtr GpuMatrix::getInverse() { - MatrixPtr matInv; - inverse(matInv, true); - return matInv; -} - -void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { - CHECK_EQ(height_, width_); - - if (memAlloc) { - matInv = std::make_shared(height_, width_); - } else { - CHECK(matInv != NULL); - } - - real* data = getData(); - real* dataInv = matInv->getData(); - int lda = getStride(); - int ldc = 
matInv->getStride(); - - hl_matrix_inverse(data, dataInv, height_, lda, ldc); -} - -void GpuMatrix::addBias(Matrix& b, real scale) { - CHECK(b.getHeight() == 1) << "the Bias should be a vector"; - BaseMatrix::addBias(b, scale); -} - -void GpuMatrix::addSharedBias(Matrix& b, real scale) { - CHECK(b.getHeight() == 1) << "the Bias should be a vector"; - CHECK_LE(b.getWidth(), getWidth()); - CHECK_EQ(getWidth() % b.getWidth(), 0UL); - hl_matrix_add_shared_bias( - getData(), b.getData(), b.getWidth(), getHeight(), getWidth(), scale); -} - -void GpuMatrix::collectBias(Matrix& a, real scale) { -#ifdef PADDLE_WITH_CUDA - CHECK_EQ(getHeight(), (size_t)1); - CHECK_EQ(width_, a.getWidth()); - GpuSparseMatrix* sMatPtr = dynamic_cast(&a); - if (!sMatPtr) { - sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); - } else { - real* data = getData(); - hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get(); - hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale); - } -#endif -} - -void GpuMatrix::collectSharedBias(Matrix& a, real scale) { - CHECK_EQ(getHeight(), (size_t)1); - CHECK_EQ(a.getWidth() % getWidth(), 0UL); - hl_matrix_collect_shared_bias( - getData(), a.getData(), getWidth(), a.getHeight(), a.getWidth(), scale); -} - -void GpuMatrix::sequenceAvgForward(Matrix& a, - const IVector& startsPos, - int mode) { - size_t height = getHeight(); - size_t width = getWidth(); - CHECK_EQ(height, startsPos.getSize() - 1); - CHECK_EQ(width, a.getWidth()); - real* dst = getData(); - real* src = a.getData(); - const int* starts = startsPos.getData(); - - hl_sequence_avg_forward(dst, src, starts, height, width, mode); -} - -void GpuMatrix::sequenceAvgBackward(Matrix& a, - const IVector& startsPos, - int mode) { - size_t height = a.getHeight(); - size_t width = getWidth(); - CHECK_EQ(height, startsPos.getSize() - 1); - CHECK_EQ(width, a.getWidth()); - real* dst = getData(); - real* src = a.getData(); - const int* starts = startsPos.getData(); - - hl_sequence_avg_backward(dst, src, starts, height, width, mode); -} - -/* this = scaleAB*(a*b) + scaleT*this */ -void GpuMatrix::mul(const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT) { - CHECK(!isTransposed()) << "Not supported"; - - if (!a.isTransposed() && !b.isTransposed()) { - CHECK_EQ(width_, b.width_); - CHECK_EQ(height_, a.height_); - CHECK_EQ(a.width_, b.height_); - } else if (a.isTransposed() && !b.isTransposed()) { - CHECK_EQ(width_, b.width_); - CHECK_EQ(height_, a.width_); - CHECK_EQ(a.height_, b.height_); - } else if (!a.isTransposed() && b.isTransposed()) { - CHECK_EQ(width_, b.height_); - CHECK_EQ(height_, a.height_); - CHECK_EQ(a.width_, b.width_); - } else { - LOG(FATAL) << "Is not supported"; - } - - real* A_d = a.data_; - real* B_d = b.data_; - real* C_d = data_; - int dimM = getHeight(); - int dimN = getWidth(); - int dimK = !a.isTransposed() ? a.width_ : a.height_; - int lda = a.getStride(); - int ldb = b.getStride(); - int ldc = getStride(); - hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T; - hl_trans_op_t transb = !b.isTransposed() ? 
HPPL_OP_N : HPPL_OP_T; - - hl_matrix_mul(A_d, - transa, - B_d, - transb, - C_d, - dimM, - dimN, - dimK, - scaleAB, - scaleT, - lda, - ldb, - ldc); -} - -void GpuMatrix::mul(const GpuSparseMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT) { -#ifdef PADDLE_WITH_CUDA - CHECK(isContiguous()); - CHECK(b.isContiguous()); - CHECK(b.useGpu_ == true) << "Matrix type are not equal"; - CHECK(!trans_ && !b.trans_) << "not supported"; - - if (!a.trans_) { - CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_) - << "Matrix dimensions are not equal"; - } else { - CHECK(width_ == b.width_ && height_ == a.width_ && a.height_ == b.height_) - << "Matrix dimensions are not equal"; - } - hl_trans_op_t transA = a.trans_ ? HPPL_OP_T : HPPL_OP_N; - hl_sparse_matrix_s A_d = a.sMatrix_.get(); - real* B_d = b.data_; - real* C_d = data_; - hl_matrix_csr_mul_dense(A_d, - transA, - B_d, - HPPL_OP_N, - C_d, - height_, - width_, - b.height_, - scaleAB, - scaleT); -#endif -} - -void GpuMatrix::mul(const GpuMatrix& a, - const GpuSparseMatrix& b, - real scaleAB, - real scaleT) { -#ifdef PADDLE_WITH_CUDA - CHECK(isContiguous()); - CHECK(a.isContiguous()); - CHECK(a.useGpu_ == true) << "Matrix type are not equal"; - - hl_sparse_matrix_s B_d = b.sMatrix_.get(); - real* A_d = a.data_; - real* C_d = data_; - hl_trans_op_t transB = b.trans_ ? HPPL_OP_T : HPPL_OP_N; - if (!b.trans_) { - CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_) - << "Matrix dimensions are not equal"; - } else { - CHECK(width_ == b.height_ && height_ == a.height_ && a.width_ == b.width_) - << "Matrix dimensions are not equal"; - } - if (b.format_ == SPARSE_CSC) { - hl_matrix_dense_mul_csc(A_d, - HPPL_OP_N, - B_d, - transB, - C_d, - height_, - width_, - a.width_, - scaleAB, - scaleT); - } else { - hl_matrix_dense_mul_csr(A_d, - HPPL_OP_N, - B_d, - transB, - C_d, - height_, - width_, - a.width_, - scaleAB, - scaleT); - } -#endif -} - -/* this = a*b */ -void GpuMatrix::mul(const Matrix& a, const Matrix& b) { mul(a, b, 1.0, 0.0); } - -void GpuMatrix::mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - const auto a_ptr = dynamic_cast(&a); - const auto b_ptr = dynamic_cast(&b); - const auto a_ptr_s = dynamic_cast(&a); - const auto b_ptr_s = dynamic_cast(&b); - - if (a_ptr && b_ptr) { - mul(*a_ptr, *b_ptr, scaleAB, scaleT); - } else if (a_ptr_s && b_ptr) { - mul(*a_ptr_s, *b_ptr, scaleAB, scaleT); - } else if (a_ptr && b_ptr_s) { - mul(*a_ptr, *b_ptr_s, scaleAB, scaleT); - } else { - LOG(FATAL) << "Not supported"; - } -} - -/* this = this* b */ -void GpuMatrix::rightMul(Matrix& b) { rightMul(b, 1.0, 0.0); } - -/* this = scaleAB*(this*b) + scaleT*this */ -void GpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) { - CHECK(dynamic_cast(&b)); - CHECK(!isTransposed()) << "Not supported"; - CHECK(!b.isTransposed()) << "Not supported"; - mul(*this, *dynamic_cast(&b), scaleAB, scaleT); -} - -/* this = a*this */ -void GpuMatrix::leftMul(Matrix& a) { leftMul(a, 1.0, 0.0); } - -/* this = scaleAB*(a*this) + scaleT*this */ -void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) { - CHECK(dynamic_cast(&a)); - CHECK(!isTransposed()) << "Not supported"; - CHECK(!a.isTransposed()) << "Not supported"; - mul(*dynamic_cast(&a), *this, scaleAB, scaleT); -} - -void GpuMatrix::selectRows(Matrix& table, IVector& ids) { -#ifdef PADDLE_WITH_CUDA - CHECK(dynamic_cast(&table)); - CHECK(table.useGpu()); - CHECK(ids.useGpu()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), 
table.getWidth()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - real* a = getData(); - size_t tableSize = table.getHeight(); - int* index = ids.getData(); - - hl_matrix_select_rows(a, - stride_, - table.getData(), - table.stride_, - index, - numSamples, - tableSize, - dim); -#endif -} - -void GpuMatrix::addToRows(Matrix& table, IVector& ids) { -#ifdef PADDLE_WITH_CUDA - CHECK(dynamic_cast<GpuMatrix*>(&table)); - CHECK(table.useGpu()); - CHECK(ids.useGpu()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), table.getWidth()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - real* a = getData(); - size_t tableSize = table.getHeight(); - int* index = ids.getData(); - - hl_matrix_add_to_rows(table.getData(), - table.stride_, - a, - stride_, - index, - numSamples, - tableSize, - dim); -#endif -} - -void GpuMatrix::colMerge(Matrix& src) { - CHECK(src.height_ == height_); - if (!trans_ && !src.trans_) { - sumRows(src, /* scaleSum= */ 1, /* scaleDest= */ 0); - } else { - LOG(FATAL) << "Is not supported"; - } -} - -void GpuMatrix::rowSum(Matrix& sum) { - CHECK_EQ(sum.getHeight(), getHeight()); - CHECK_EQ(sum.getWidth(), (size_t)1); - - sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); -} - -void GpuMatrix::rowMax(Matrix& max) { - CHECK_EQ(max.getHeight(), getHeight()); - CHECK_EQ(max.getWidth(), (size_t)1); - - max.maxRows(*this); -} - -void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { -#ifdef PADDLE_WITH_CUDA - CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal"; - size_t numSamples = getHeight(); - size_t beam = maxVal.getWidth(); - CHECK_EQ(maxIds.getSize(), numSamples * beam); - CHECK_EQ(maxVal.getHeight(), numSamples); - CHECK_EQ(maxVal.getWidth(), beam); - - hl_matrix_top_k(maxVal.getData(), - maxVal.getStride(), - maxIds.getData(), - this->getData(), - this->getStride(), - this->getWidth(), - beam, - numSamples); -#endif -} - -void GpuMatrix::colMax(Matrix& max) { - CHECK_EQ(max.getWidth(), getWidth()); - CHECK_EQ(max.getHeight(), (size_t)1); - - max.maxCols(*this); -} - -void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { - LOG(FATAL) << "Is not supported"; -} - -void GpuMatrix::maxoutForward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - CHECK(dynamic_cast<GpuMatrix*>(&a)); - CHECK(dynamic_cast<GpuIVector*>(&id)); - CHECK_EQ(a.getHeight(), getHeight()); - - size_t size = getWidth(); - size_t batchSize = getHeight(); - const real* input = a.getData(); - real* output = getData(); - int* idForGpu = id.getData(); - - hl_maxout_forward( - input, output, idForGpu, batchSize, size, size / channels, groups); -} - -void GpuMatrix::maxoutBackward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - CHECK(dynamic_cast<GpuMatrix*>(&a)); - CHECK(dynamic_cast<GpuIVector*>(&id)); - CHECK_EQ(a.getHeight(), getHeight()); - - size_t size = a.getWidth(); - size_t batchSize = getHeight(); - real* input = getData(); - const real* output = a.getData(); - const int* idForGpu = id.getData(); - - hl_maxout_backward( - input, output, idForGpu, batchSize, size, size / channels, groups); -} - -/* calculate the error of classification */ -void GpuMatrix::classificationError(Matrix& output, - IVector& label, - size_t topkSize) { - auto gpuOutput = dynamic_cast<GpuMatrix*>(&output); - auto gpuLabel = dynamic_cast<GpuIVector*>(&label); - size_t numSamples = this->getHeight(); - GpuMatrixPtr gpuTopVal = std::make_shared<GpuMatrix>(numSamples, topkSize); - GpuIVectorPtr gpuTopIds = std::make_shared<GpuIVector>(numSamples * topkSize); - - CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer"; -
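For reference: the hl_matrix_classification_error call below fills this N x 1 vector from the top-k scratch buffers allocated above, under the assumed contract that sample i counts as an error exactly when label[i] is absent from the topkSize largest entries of output row i. A minimal CPU sketch of that contract (all names illustrative, not from this diff; assumes topk <= dim):

#include <algorithm>
#include <cstddef>
#include <vector>

static void classificationErrorRef(const float* output, const int* label,
                                   float* err, size_t numSamples, size_t dim,
                                   size_t topk) {
  std::vector<size_t> idx(dim);
  for (size_t i = 0; i < numSamples; ++i) {
    const float* row = output + i * dim;
    for (size_t j = 0; j < dim; ++j) idx[j] = j;
    // Bring the topk largest entries of this row to the front.
    std::partial_sort(idx.begin(), idx.begin() + topk, idx.end(),
                      [row](size_t a, size_t b) { return row[a] > row[b]; });
    err[i] = 1.0f;  // assume error until the label shows up in the top-k
    for (size_t k = 0; k < topk; ++k)
      if (static_cast<int>(idx[k]) == label[i]) { err[i] = 0.0f; break; }
  }
}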
CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed"; - CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal"; - CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1) - << "Matrix dimensions are not equal"; - - size_t dim = gpuOutput->getWidth(); - hl_matrix_classification_error(gpuTopVal->getData(), - gpuTopVal->getStride(), - gpuTopIds->getData(), - gpuOutput->getData(), - gpuOutput->getStride(), - dim, - topkSize, - numSamples, - gpuLabel->getData(), - this->getData()); -} - -/* copy -log(output[i * width + label]) to this->data[i] */ -void GpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) { - GpuMatrix* output_ptr = dynamic_cast(&output); - GpuIVector* label_ptr = dynamic_cast(&label); - - CHECK(output_ptr && label_ptr) << "Invalid argument pointer"; - - CHECK(height_ == label.getSize() && width_ == 1 && height_ == output.height_) - << "Matrix dimensions are not equal"; - - real* A_d = output_ptr->data_; - real* C_d = data_; - int* label_d = label_ptr->getData(); - - hl_matrix_cross_entropy(A_d, C_d, label_d, height_, output.width_); -} - -/* calculate the error of outputV according to label */ -void GpuMatrix::oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { - GpuMatrix* output_ptr = dynamic_cast(&outputV); - GpuIVector* label_ptr = dynamic_cast(&label); - - CHECK(output_ptr && label_ptr) << "Invalid argument pointer"; - - CHECK(height_ == output_ptr->height_ && width_ == output_ptr->width_) - << "Matrix dimensions are not equal"; - - real* output_d = output_ptr->data_; - real* grad_d = data_; - int* label_d = label_ptr->getData(); - - hl_matrix_cross_entropy_bp(grad_d, output_d, label_d, height_, width_); -} - -void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; -} - -void GpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; -} - -void GpuMatrix::softmax(Matrix& output) { - CHECK(output.useGpu()) << "Matrix type are not equal"; - - size_t height = getHeight(); - size_t width = getWidth(); - CHECK(height == output.getHeight() && width == output.getWidth()) - << "Matrix dimensions are not equal"; - - real* inputData = getData(); - real* outputData = output.getData(); - hl_matrix_softmax(inputData, outputData, height, width); -} - -void GpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) { - CHECK_EQ(getWidth(), 1UL); - CHECK_EQ(output.getWidth(), 1UL); - CHECK(isContiguous()); - - real* inputData = getData(); - real* outputData = output.getData(); - auto starts = index.getData(); - int numSequences = index.getSize() - 1; - hl_sequence_softmax_forward(inputData, outputData, starts, numSequences); -} - -void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { - CHECK(output.useGpu_ == true && sftmaxSum.useGpu_ == true) - << "Matrix type are not equal"; - - CHECK(height_ == output.height_ && width_ == output.width_ && - height_ == sftmaxSum.height_) - << "Matrix dimensions are not equal"; - - real* output_d = output.data_; - real* sftmaxSum_d = sftmaxSum.data_; - real* grad_d = data_; - hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, width_); -} - -void GpuMatrix::softmaxBackward(Matrix& outputV) { - CHECK(outputV.useGpu()) << "Matrix type are not equal"; - - size_t height = getHeight(); - size_t width = getWidth(); - CHECK(height == outputV.getHeight() && width == outputV.getWidth()) - << "Matrix dimensions are not equal"; - - 
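Context for the call below: with y = softmax(x), the Jacobian identity gives dx_j = y_j * (dy_j - sum_k dy_k * y_k). A CPU sketch under the assumption that hl_softmax_backward applies this identity in place on the gradient (the kernel's exact contract is not shown in this diff):

#include <cstddef>

static void softmaxBackwardRef(float* grad, const float* out, size_t height,
                               size_t width) {
  for (size_t i = 0; i < height; ++i) {
    double dot = 0;  // <grad_row, out_row>
    for (size_t j = 0; j < width; ++j)
      dot += grad[i * width + j] * out[i * width + j];
    for (size_t j = 0; j < width; ++j)
      grad[i * width + j] =
          out[i * width + j] * (grad[i * width + j] - static_cast<float>(dot));
  }
}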
real* output_grad = getData(); - real* output_value = outputV.getData(); - hl_softmax_backward(output_value, output_grad, height, width); -} - -void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { - CHECK_EQ(label.getHeight(), height_); - CHECK_EQ(output.getHeight(), height_); - CHECK_EQ(label.getWidth(), output.getWidth()); - CHECK_EQ((size_t)1, width_); - - auto labelptr = dynamic_cast(&label); - if (labelptr) { - LOG(FATAL) << "not supported: GpuSparseMatrix as label"; - } - - BaseMatrix::sumOfSquaredDiffs(output, - label, - /* scaleSum= */ 1, - /* scaleDest= */ 1); -} - -void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) { - add2(outputV, label, 1, 2, -2); -} - -void GpuMatrix::tanh(Matrix& output) { BaseMatrix::tanh(output); } - -void GpuMatrix::tanhDerivative(Matrix& output) { - BaseMatrix::tanhDerivative(output); -} - -void GpuMatrix::softrelu(Matrix& output) { BaseMatrix::softrelu(output); } - -void GpuMatrix::softreluDerivative(Matrix& output) { - BaseMatrix::softreluDerivative(output); -} - -void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { - BaseMatrix::scaledTanh(output, p1, p2); -} - -void GpuMatrix::randomizeUniform() { - CHECK(isContiguous()); - real* data = data_; - size_t size = height_ * width_; - - hl_rand(data, size); -} - -void GpuMatrix::print(std::ostream& os) const { - CHECK(isContiguous()); - CpuMatrix cpuMat(getHeight(), getWidth()); - cpuMat.copyFrom(*this); - cpuMat.print(os); -} - -void GpuMatrix::print(std::ostream& os, size_t height, size_t width) const { - CHECK(isContiguous()); - CpuMatrix cpuMat(getHeight(), getWidth()); - cpuMat.copyFrom(*this); - cpuMat.print(os, height, width); -} - -void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { - CHECK(isContiguous()); - CHECK(height_ == refMat.getHeight()); - CHECK(width_ == refMat.getWidth()); - CpuMatrix cpuRef(height_, width_); - GpuMatrix gpuRef(height_, width_); - cpuRef.copyFrom(refMat); - gpuRef.copyFrom(*this); - size_t diffCnt = 0; - for (size_t i = 0; i < height_; ++i) { - for (size_t j = 0; j < width_; ++j) { - real a = gpuRef.getElement(i, j); - real b = cpuRef.getElement(i, j); - if (fabs(a - b) > 0.00001) { - ++diffCnt; - if (printDiff) { - os << "ref= " << a << " check= " << b << std::endl; - } - } - } - } - LOG(INFO) << "the diffCnt is " << diffCnt; -} - -void GpuMatrix::upsampleForward(Matrix& input, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - CHECK(input.useGpu_ == true) << "Matrix type are not equal"; - CHECK(mask.useGpu_ == true) << "Matrix type are not equal"; - - real* inputData = input.getData(); - real* maskData = mask.getData(); - real* outData = data_; - - size_t batch = input.getHeight(); - - CHECK(imgSizeH * imgSizeW * channels == input.getWidth()); - CHECK(imgSizeH * imgSizeW * channels == mask.getWidth()); - CHECK_EQ(batch, this->getHeight()); - CHECK(width_ == outputH * outputW * channels); - hl_upsample_forward(inputData, - maskData, - batch, - imgSizeH, - imgSizeW, - channels, - outputH, - outputW, - outData); -} - -void GpuMatrix::upsampleBackward(Matrix& outputGrad, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - CHECK(outputGrad.useGpu_ == true) << "Matrix type are not equal"; - CHECK(mask.useGpu_ == true) << "Matrix type are not equal"; - - real* outputGradData = outputGrad.getData(); - real* maskData = mask.getData(); - real* inputGradData = data_; - size_t batch = outputGrad.getHeight(); 
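The kernel invoked below mirrors the CPU upsampleBackward later in this file: mask holds, per (sample, channel) plane, the flat offset each input element was copied to during the forward unpooling. A plain-C++ sketch of one plane (illustrative names; this sketch skips out-of-range offsets instead of aborting):

#include <cstddef>

static void upsampleBackwardPlane(const float* outGrad, const float* mask,
                                  float* inGrad, size_t inLen, size_t outLen) {
  for (size_t i = 0; i < inLen; ++i) {
    size_t o = static_cast<size_t>(mask[i]);  // forward-pass target offset
    inGrad[i] = (o < outLen) ? outGrad[o] : 0.0f;
  }
}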
- - CHECK(imgSizeH * imgSizeW == this->getWidth() / channels); - CHECK_EQ(batch, this->getHeight()); - CHECK_EQ(channels * outputH * outputW, outputGrad.getWidth()); - hl_upsample_backward(outputGradData, - maskData, - batch, - imgSizeH, - imgSizeW, - channels, - outputH, - outputW, - inputGradData); -} - -void GpuMatrix::maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - MatrixPtr maskMatP) { - CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; - - real* inputData = inputMat.getData(); - real* maskData = NULL; - size_t frameNum = inputMat.getHeight(); - CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); - CHECK(width_ == outputH * outputW * channels); - - if (maskMatP != NULL) { - CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal"; - CHECK(outputH * outputW * channels == maskMatP->getWidth()); - maskData = maskMatP->getData(); - } - - hl_maxpool_forward(frameNum, - inputData, - channels, - imgSizeH, - imgSizeW, - outputH, - outputW, - sizeX, - sizeY, - strideH, - strideW, - paddingH, - paddingW, - data_, - getStride(), - maskData); -} - -void GpuMatrix::maxPoolBackward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW) { - CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true && - outV.useGpu_ == true) - << "Matrix type are not equal"; - - real* inputData = inputMat.getData(); - real* outData = outV.getData(); - real* outDiff = outGrad.getData(); - size_t frameNum = inputMat.getHeight(); - size_t channels = outV.getWidth() / outputH / outputW; - CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); - CHECK(outGrad.getHeight() == outV.getHeight() && - outGrad.getWidth() == outV.getWidth()); - - hl_maxpool_backward(frameNum, - inputData, - outData, - outDiff, - channels, - imgSizeH, - imgSizeW, - outputH, - outputW, - sizeX, - sizeY, - strideH, - strideW, - paddingH, - paddingW, - scaleTargets, - scaleOutput, - data_, - outGrad.getStride()); -} - -void GpuMatrix::avgPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - bool excludeMode) { - CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; - - real* inputData = inputMat.getData(); - size_t frameNum = inputMat.getHeight(); - CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); - CHECK(width_ == outputH * outputW * channels); - - hl_avgpool_forward(frameNum, - inputData, - channels, - imgSizeH, - imgSizeW, - outputH, - outputW, - sizeX, - sizeY, - strideH, - strideW, - paddingH, - paddingW, - data_, - getStride(), - excludeMode); -} - -void GpuMatrix::avgPoolBackward(Matrix& outGrad, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW, - bool excludeMode) { - CHECK(outGrad.useGpu_ == true) << "Matrix type are 
not equal"; - - real* outDiff = outGrad.getData(); - size_t frameNum = outGrad.getHeight(); - size_t channels = outGrad.getWidth() / outputH / outputW; - CHECK(imgSizeH * imgSizeW * channels == width_); - CHECK(height_ == outGrad.getHeight()); - CHECK(outGrad.getWidth() == outputH * outputW * channels); - - hl_avgpool_backward(frameNum, - outDiff, - channels, - imgSizeH, - imgSizeW, - outputH, - outputW, - sizeX, - sizeY, - strideH, - strideW, - paddingH, - paddingW, - scaleTargets, - scaleOutput, - data_, - outGrad.getStride(), - excludeMode); -} - -void GpuMatrix::maxPool3DForward(Matrix& inputMat, - Matrix& maxPoolIdx, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - CHECK(inputMat.useGpu_) << "Matrix type are not correct"; - - real* inputData = inputMat.getData(); - real* maxPoolIdxData = maxPoolIdx.getData(); - size_t num = inputMat.getHeight(); - CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); - CHECK(width_ == outputD * outputH * outputW * channels); - - hl_maxpool3D_forward(num, - inputData, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outputD, - outputH, - outputW, - sizeZ, - sizeY, - sizeX, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - getData(), - maxPoolIdxData, - getStride()); -} - -void GpuMatrix::maxPool3DBackward(Matrix& outGrad, - Matrix& maxPoolIdx, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal"; - - real* outDiff = outGrad.getData(); - real* maxPoolIdxData = maxPoolIdx.getData(); - size_t frameNum = getHeight(); - size_t channels = outGrad.getWidth() / outputD / outputH / outputW; - CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth()); - CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() && - outGrad.getWidth() == maxPoolIdx.getWidth()); - - hl_maxpool3D_backward(frameNum, - outDiff, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outputD, - outputH, - outputW, - sizeZ, - sizeY, - sizeX, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - scaleTargets, - scaleOutput, - getData(), - maxPoolIdxData, - outGrad.getStride()); -} - -void GpuMatrix::avgPool3DForward(Matrix& inputMat, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - CHECK(inputMat.useGpu_) << "Matrix type are not equal"; - - real* inputData = inputMat.getData(); - size_t frameNum = inputMat.getHeight(); - CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); - CHECK(width_ == outputD * outputH * outputW * channels); - - hl_avgpool3D_forward(frameNum, - inputData, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outputD, - outputH, - outputW, - sizeZ, - sizeY, - sizeX, - strideD, - strideH, - strideW, - 
paddingD, - paddingH, - paddingW, - getData(), - getStride()); -} - -void GpuMatrix::avgPool3DBackward(Matrix& outGrad, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - CHECK(outGrad.useGpu_) << "Matrix type are not equal"; - - real* outDiff = outGrad.getData(); - size_t frameNum = outGrad.getHeight(); - size_t channels = outGrad.getWidth() / outputD / outputH / outputW; - CHECK(imgSizeD * imgSizeH * imgSizeW * channels == width_); - CHECK(height_ == outGrad.getHeight()); - CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels); - - hl_avgpool3D_backward(frameNum, - outDiff, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outputD, - outputH, - outputW, - sizeZ, - sizeY, - sizeX, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - scaleTargets, - scaleOutput, - getData(), - outGrad.getStride()); -} - -void GpuMatrix::maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index) { - CHECK(dynamic_cast(&input)); - CHECK(dynamic_cast(&sequence)); - CHECK(dynamic_cast(&index)); - - real* outData = getData(); - real* inputData = input.getData(); - const int* starts = sequence.getData(); - int* maxIndex = index.getData(); - size_t numSequences = getHeight(); - size_t dim = getWidth(); - - CHECK_EQ(dim, input.getWidth()); - CHECK_EQ(numSequences, sequence.getSize() - 1); - CHECK_EQ(numSequences * dim, index.getSize()); - - hl_max_sequence_forward( - inputData, starts, outData, maxIndex, numSequences, dim); -} - -void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index) { - CHECK(dynamic_cast(&outputGrad)); - CHECK(dynamic_cast(&sequence)); - CHECK(dynamic_cast(&index)); - - real* inputGrad = getData(); - real* outGrad = outputGrad.getData(); - int* maxIndex = index.getData(); - size_t dim = getWidth(); - size_t numSequences = sequence.getSize() - 1; - - CHECK_EQ(dim, outputGrad.getWidth()); - CHECK_EQ(numSequences, outputGrad.getHeight()); - CHECK_EQ(numSequences * dim, index.getSize()); - - hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim); -} - -void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) { - CHECK(data.useGpu_ == true && W.useGpu_ == true) - << "Matrix type are not equal"; - real* input = data.getData(); - real* w = W.getData(); - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = W.getHeight() * W.getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - size_t partial_sum = numElements / paraSize; - real* output = getData(); - hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum); -} - -void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { - CHECK(oGrad.useGpu_ == true && data.useGpu_ == true) - << "Matrix type are not equal"; - real* ograd = oGrad.getData(); - real* input = data.getData(); - real* wgrad = data_; - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = this->getHeight() * this->getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - size_t partial_sum = numElements / paraSize; - hl_param_relu_backward_w( - wgrad, ograd, input, numElements, numSamples, partial_sum); -} - -void 
GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { - real* diff = data_; - real* input = data.getData(); - real* ograd = oGrad.getData(); - real* w = W.getData(); - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = W.getHeight() * W.getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - size_t partial_sum = numElements / paraSize; - hl_param_relu_backward_diff( - ograd, input, w, diff, numElements, numSamples, partial_sum); -} - -void GpuMatrix::addColumnVector(const Matrix& b) { - BaseMatrix::addColVector(const_cast<Matrix&>(b)); -} - -void GpuMatrix::bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - CHECK(dynamic_cast<const GpuMatrix*>(&in)); - - const size_t outputW = getWidth(); - const size_t outputH = getHeight(); - const size_t inputW = in.getWidth(); - const size_t inputH = in.getHeight(); - - real* outData = getData(); - const real* inData = in.getData(); - - if (inImgH == outImgH && inImgW == outImgW) { - this->copyFrom(in); - } else { - hl_bilinear_forward(inData, - inImgH, - inImgW, - inputH, - inputW, - outData, - outImgH, - outImgW, - outputH, - outputW, - numChannels, - ratioH, - ratioW); - } -} - -void GpuMatrix::bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - CHECK(dynamic_cast<const GpuMatrix*>(&out)); - - const size_t inputW = getWidth(); - const size_t inputH = getHeight(); - const size_t outputW = out.getWidth(); - const size_t outputH = out.getHeight(); - - real* inGrad = getData(); - const real* outGrad = out.getData(); - - if (outImgH == inImgH && outImgW == inImgW) { - this->add(const_cast<Matrix&>(out)); - } else { - hl_bilinear_backward(inGrad, - inImgH, - inImgW, - inputH, - inputW, - outGrad, - outImgH, - outImgW, - outputH, - outputW, - numChannels, - ratioH, - ratioW); - } -} - -void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { -#ifdef PADDLE_WITH_CUDA - GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output); - auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label); - - CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; - CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; - CHECK(height_ == outputPtr->height_ && width_ == 1 && - outputPtr->width_ == labelPtr->getWidth() && - outputPtr->height_ == labelPtr->getHeight()) - << "Matrix dimensions are not equal"; - - real* output_d = outputPtr->data_; - real* entropy_d = data_; - hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); - hl_matrix_multi_binary_cross_entropy( - output_d, entropy_d, mat_d, height_, outputPtr->width_); -#endif -} - -void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { -#ifdef PADDLE_WITH_CUDA - GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output); - auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label); - - CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; - CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; - CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ && - outputPtr->width_ == labelPtr->getWidth() && - outputPtr->height_ == labelPtr->getHeight()) - << "Matrix dimensions are not equal"; - - real* output_d = outputPtr->data_; - real* grad_d = data_; - hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); -
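For orientation: with a CSR label matrix whose row i lists the positive classes, multi-binary cross entropy is conventionally E_i = -sum_{j in pos(i)} log(p_ij) - sum_{j not in pos(i)} log(1 - p_ij), so the gradient written below would be -1/p_ij on positives and 1/(1 - p_ij) elsewhere. A reference loop under that assumed semantics (cols assumed sorted within each row; names illustrative, not from this diff):

#include <cmath>
#include <cstddef>

static void multiBinaryXentRef(const float* p, const int* rowPtr,
                               const int* cols, float* entropy, size_t height,
                               size_t width) {
  for (size_t i = 0; i < height; ++i) {
    double e = 0;
    int nz = rowPtr[i];  // walks the positives of row i
    for (size_t j = 0; j < width; ++j) {
      bool pos = nz < rowPtr[i + 1] && static_cast<size_t>(cols[nz]) == j;
      e -= pos ? std::log(p[i * width + j]) : std::log(1.0 - p[i * width + j]);
      if (pos) ++nz;
    }
    entropy[i] = static_cast<float>(e);
  }
}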
hl_matrix_multi_binary_cross_entropy_bp( - output_d, grad_d, mat_d, height_, width_); -#endif -} - -void GpuMatrix::vol2Col(real* dataSrc, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW) { - hl_matrix_vol2Col(dataSrc, - channels, - depth, - height, - width, - filterD, - filterH, - filterW, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - getData()); -} - -void GpuMatrix::col2Vol(real* dataDst, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real alpha, - real beta) { - hl_matrix_col2Vol(dataDst, - channels, - depth, - height, - width, - filterD, - filterH, - filterW, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - getData(), - alpha, - beta); -} - -/** - * CpuMatrix - */ - -CpuMatrix::CpuMatrix(size_t height, size_t width, bool trans) - : Matrix(std::make_shared(height * width * sizeof(real)), - height, - width, - trans, - false) {} - -CpuMatrix::~CpuMatrix() {} - -void CpuMatrix::zeroMem() { - CHECK(data_ != NULL); - if (isContiguous()) { - memset(data_, 0, height_ * width_ * sizeof(real)); - } else { - BaseMatrix::zero(); - } -} -void CpuMatrix::resetOne() { - CHECK(data_ != NULL); - BaseMatrix::one(); -} - -void CpuMatrix::copyFrom(const Matrix& src) { - CHECK(isContiguous()); - if (typeid(src) == typeid(GpuMatrix)) { - CHECK(src.isContiguous()); - CHECK(elementCnt_ == src.getElementCnt()); - hl_memcpy_device2host( - data_, const_cast(src.getData()), sizeof(real) * elementCnt_); - } else if (typeid(src) == typeid(CpuMatrix) || - typeid(src) == typeid(SharedCpuMatrix)) { - CHECK(src.isContiguous()); - CHECK(elementCnt_ == src.getElementCnt()); - memcpy(data_, src.getData(), sizeof(real) * elementCnt_); - } else if (typeid(src) == typeid(CpuSparseMatrix)) { - CHECK_GE(elementCnt_, src.getElementCnt()); - copyFrom(dynamic_cast(const_cast(src))); - } else { - LOG(FATAL) << "Wrong"; - } -} - -void CpuMatrix::copyFrom(CpuSparseMatrix& src) { - CHECK(isContiguous()); - CHECK(height_ == src.getHeight()); - CHECK(width_ == src.getWidth()); - memset(data_, 0, sizeof(real) * height_ * width_); - if (src.getValueType() == FLOAT_VALUE) { - if (src.getFormat() == SPARSE_CSC) { - int* rows = src.getRows(); - real* vals = src.getValue(); - for (size_t i = 0; i < width_; i++) { - for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1); - j++) { - data_[rows[j] * width_ + i] = vals[j]; - } - } - } else { - int* cols = src.getCols(); - real* vals = src.getValue(); - for (size_t i = 0; i < height_; i++) { - for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1); - j++) { - data_[i * width_ + cols[j]] = vals[j]; - } - } - } - } else { - if (src.getFormat() == SPARSE_CSC) { - int* rows = src.getRows(); - for (size_t i = 0; i < width_; i++) { - for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1); - j++) { - data_[rows[j] * width_ + i] = 1.0; - } - } - } else { - int* cols = src.getCols(); - for (size_t i = 0; i < height_; i++) { - for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1); - j++) { - data_[i * width_ + cols[j]] = 1.0; - } - } - } - } -} - -void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { - CHECK(isContiguous()); - CHECK(src.isContiguous()); - CHECK(elementCnt_ == src.getElementCnt()); - if 
(typeid(src) == typeid(GpuMatrix)) { - hl_memcpy_async(this->getData(), - const_cast<real*>(src.getData()), - sizeof(real) * elementCnt_, - stream); - // There is a need to add synchronization to ensure that the data is copied. - hl_stream_synchronize(stream); - } else if (typeid(src) == typeid(CpuMatrix)) { - memcpy(data_, src.getData(), sizeof(real) * elementCnt_); - } else { - LOG(FATAL) << "Wrong"; - } -} - -void CpuMatrix::copyFrom(const real* cpuSrc, size_t size) { - CHECK(isContiguous()); - CHECK(size <= elementCnt_); - memcpy(data_, cpuSrc, sizeof(real) * size); -} - -void CpuMatrix::copyFrom(const real* cpuSrc, const int64_t* seq) { - CHECK(isContiguous()); - for (size_t i = 0; i < height_; i++) { - memcpy(data_ + i * width_, cpuSrc + seq[i] * width_, sizeof(real) * width_); - } -} - -void CpuMatrix::copyFrom(const IVector& src) { - CHECK(isContiguous()); - CHECK(elementCnt_ == src.getSize()) - << "the src and dst should have same size."; - const int* cpuSrc = NULL; - IVectorPtr tmp; - if (src.useGpu()) { - CpuIVector tmp(src.getSize()); - tmp.copyFrom(src); - cpuSrc = tmp.getData(); - } else { - cpuSrc = src.getData(); - } - for (size_t i = 0; i < elementCnt_; ++i) { - data_[i] = cpuSrc[i]; - } -} - -void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { - size_t height = getHeight(); - size_t width = getWidth(); - CHECK_EQ(b.getWidth(), width); - const int* index = rowIndex.getData(); - for (size_t i = 0; i < height; i++) { - CHECK_LT(static_cast<size_t>(index[i]), b.getHeight()); - real* src = b.getData() + index[i] * width; - real* dst = getData() + i * width; - memcpy(dst, src, sizeof(real) * width); - } -} - -MatrixPtr CpuMatrix::clone(size_t height, size_t width, bool useGpu) { - CHECK(isContiguous()); - - if (height == 0 && width == 0) { - height = height_; - width = width_; - } - - CHECK(width && height); - - if (useGpu) { - return std::make_shared<GpuMatrix>(height, width); - } else { - return std::make_shared<CpuMatrix>(height, width); - } -} - -void CpuMatrix::resize(size_t newHeight, size_t newWidth) { - size_t newSize = newHeight * newWidth; - if (NULL == memoryHandle_.get() || - newSize * sizeof(real) > memoryHandle_->getAllocSize()) { - memoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real)); - data_ = reinterpret_cast<real*>(memoryHandle_->getBuf()); - } - - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newSize; - stride_ = width_; -} - -real CpuMatrix::getElement(size_t x, size_t y) const { - return data_[x * stride_ + y]; -} - -real CpuMatrix::getSum() { - CHECK(isContiguous()); - double sum = 0; - for (size_t i = 0; i < height_; ++i) { - for (size_t j = 0; j < width_; ++j) { - sum += data_[i * width_ + j]; - } - } - return sum; -} - -void CpuMatrix::accumulateColSum(Matrix& src) { - CHECK_EQ(getWidth(), src.getWidth()); - CHECK_EQ(getHeight(), (size_t)1); - - sumCols(src, /* scaleSum= */ 1, /* scaleDest= */ 1); -} - -real CpuMatrix::getAbsSum() { - CHECK(isContiguous()); - double sum = 0; - for (size_t i = 0; i < height_; ++i) { - for (size_t j = 0; j < width_; ++j) { - sum += fabs(data_[i * width_ + j]); - } - } - return sum; -} - -MatrixPtr CpuMatrix::getTranspose() { - if (memoryHandle_.get() != NULL) { - return std::make_shared<CpuMatrix>( - std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle_), - height_, - width_, - true); - } else { - MatrixPtr copy_T(new CpuMatrix(data_, height_, width_, true)); - return copy_T; - } -} - -void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { - if (memAlloc) { - matTrans = std::make_shared<CpuMatrix>(width_, height_); - } else { - CHECK(matTrans != NULL); -
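The element-wise transpose loop below issues a strided write per element; where this mattered, a cache-blocked variant is the usual alternative. A sketch under that design (tile size illustrative, not tuned for this code base):

#include <algorithm>
#include <cstddef>

static void transposeBlocked(const float* src, float* dst, size_t h, size_t w,
                             size_t lda, size_t ldc) {
  const size_t B = 32;  // tile edge; pick so two tiles fit in L1 cache
  for (size_t i0 = 0; i0 < h; i0 += B)
    for (size_t j0 = 0; j0 < w; j0 += B)
      for (size_t i = i0; i < std::min(i0 + B, h); ++i)
        for (size_t j = j0; j < std::min(j0 + B, w); ++j)
          dst[j * ldc + i] = src[i * lda + j];
}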
CHECK_EQ(matTrans->getHeight(), width_); - CHECK_EQ(matTrans->getWidth(), height_); - } - real* dataTrans = matTrans->getData(); - real* data = getData(); - int lda = getStride(); - int ldc = matTrans->getStride(); - - for (size_t i = 0; i < height_; i++) { - for (size_t j = 0; j < width_; j++) { - dataTrans[j * ldc + i] = data[i * lda + j]; - } - } -} - -void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { - if (memAlloc) { - matRot = std::make_shared<CpuMatrix>(width_, height_); - } else { - CHECK(matRot != NULL); - CHECK_EQ(matRot->getHeight(), width_); - CHECK_EQ(matRot->getWidth(), height_); - } - real* dataRot = matRot->getData(); - real* data = getData(); - - for (size_t i = 0; i < height_; i++) { - for (size_t j = 0; j < width_; j++) { - if (clockWise) { - dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j]; - } else { - dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)]; - } - } - } -} - -MatrixPtr CpuMatrix::getInverse() { - MatrixPtr matInv; - inverse(matInv, true); - return matInv; -} - -void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { - CHECK_EQ(height_, width_); - - if (memAlloc) { - matInv = std::make_shared<CpuMatrix>(height_, width_); - } else { - CHECK(matInv != NULL); - } - - CHECK_EQ(height_, matInv->getHeight()); - CHECK_EQ(width_, matInv->getWidth()); - matInv->copyFrom(*this); - - real* data = getData(); - real* dataInv = matInv->getData(); - int ldc = matInv->getStride(); - - if (height_ == 1) { - CHECK_NE(*data, 0); - *dataInv = 1.0 / (*data); - return; - } - - /* Compute the LU decomposition of the matrix */ - std::vector<int> ipiv(height_); - CBLAS_ORDER order = (matInv->isTransposed() ? CblasColMajor : CblasRowMajor); - int info = getrf(order, height_, height_, dataInv, ldc, ipiv.data()); - CHECK_EQ(info, 0); - - /* Compute the inverse of the matrix given its LU decomposition */ - info = getri(order, height_, dataInv, ldc, ipiv.data()); - CHECK_EQ(info, 0); -} - -void CpuMatrix::upsampleForward(Matrix& input, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - real* inputData = input.getData(); - real* maskData = mask.getData(); - real* outData = data_; - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - size_t batch = input.getHeight(); - CHECK(inLength == input.getWidth() / channels); - CHECK_EQ(batch, this->getHeight()); - CHECK_EQ(channels * outLength, this->getWidth()); - - for (size_t k = 0; k < batch; k++) { - for (size_t c = 0; c < channels; c++) { - for (size_t i = 0; i < inLength; i++) { - size_t out_index = static_cast<size_t>(maskData[i]); - if (out_index >= outLength) { - LOG(FATAL) << "upsample index " << out_index << " out of range."; - } - outData[out_index] = inputData[i]; - } - inputData += inLength; - maskData += inLength; - outData += outLength; - } - } -} - -void CpuMatrix::upsampleBackward(Matrix& outputGrad, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - real* outputGradData = outputGrad.getData(); - real* maskData = mask.getData(); - real* inputGradData = data_; - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - size_t batch = outputGrad.getHeight(); - CHECK(inLength == this->getWidth() / channels); - CHECK_EQ(batch, this->getHeight()); - CHECK_EQ(channels * outLength, outputGrad.getWidth()); - - for (size_t k = 0; k < batch; k++) { - for (size_t c = 0; c < channels; c++) { - for (size_t i = 0; i < inLength; i++) { - size_t 
out_index = static_cast(maskData[i]); - if (out_index >= outLength) { - LOG(FATAL) << "upsample index " << out_index << " out of range."; - } - inputGradData[i] = outputGradData[out_index]; - } - inputGradData += inLength; - maskData += inLength; - outputGradData += outLength; - } - } -} - -void CpuMatrix::maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - MatrixPtr maskMatP) { - real* inputData = inputMat.getData(); - real* outData = data_; - real* maskData = NULL; - size_t num = inputMat.getHeight(); - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - CHECK(inLength == inputMat.getWidth() / channels); - CHECK_EQ(num, this->getHeight()); - CHECK_EQ(channels * outLength, this->getWidth()); - size_t outStride = getStride(); - - if (maskMatP != NULL) { - maskData = maskMatP->getData(); - CHECK_EQ(channels * outLength, maskMatP->getWidth()); - } - - /* pool max one by one */ - for (size_t n = 0; n < num; ++n) { // frame by frame - if (!isContiguous()) { - outData = data_ + n * outStride; - } - for (size_t c = 0; c < channels; ++c) { // channel by channel - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = hstart + sizeY; - hstart = hstart < 0 ? 0 : hstart; - hend = hend < (int)imgSizeH ? hend : (int)imgSizeH; - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = wstart + sizeX; - wstart = wstart < 0 ? 0 : wstart; - wend = wend < (int)imgSizeW ? wend : (int)imgSizeW; - - real maxval = -(real)FLT_MAX; - int max_index = -1; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (maxval < inputData[h * imgSizeW + w]) { - maxval = inputData[h * imgSizeW + w]; - max_index = h * imgSizeW + w; - } - } - } - - outData[ph * outputW + pw] = maxval; - if (maskData != NULL) maskData[ph * outputW + pw] = max_index; - } - } - // compute offset - inputData += inLength; - outData += outLength; - - if (maskData != NULL) maskData += outLength; - } - } -} - -void CpuMatrix::maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW) { - size_t num = image.getHeight(); - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - size_t channels = size_t(width_ / inLength); - CHECK(image.getWidth() == inLength * channels); - CHECK(image.getHeight() == height_ && image.getWidth() == width_); - CHECK(outV.getHeight() == outGrad.getHeight() && - outV.getWidth() == outGrad.getWidth()); - - real* tgtGrad = data_; - real* inData = image.getData(); - real* otData = outV.getData(); - real* otGrad = outGrad.getData(); - - size_t outStride = outV.getStride(); - real* origOutData = otData; - real* origOutGrad = otGrad; - - for (size_t n = 0; n < num; ++n) { - if (!outV.isContiguous()) { - otData = origOutData + n * outStride; - otGrad = origOutGrad + n * outStride; - } - for (size_t c = 0; c < channels; ++c) { - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; 
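// Note on the loop below: the gradient of each output cell is routed back to
// every input position whose value equals the pooled maximum (the
// `inData == otData` comparison acts as a 0/1 mask), so tied maxima each
// receive the full gradient; scaleTargets/scaleOutput blend the existing
// input gradient with this new contribution.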
- int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - tgtGrad[h * imgSizeW + w] = - scaleTargets * tgtGrad[h * imgSizeW + w] + - scaleOutput * otGrad[ph * outputW + pw] * - (inData[h * imgSizeW + w] == otData[ph * outputW + pw]); - } - } - } - } - // offset - inData += inLength; - tgtGrad += inLength; - otData += outLength; - otGrad += outLength; - } - } -} - -void CpuMatrix::avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - bool excludeMode) { - // The main loop - size_t num = input.getHeight(); - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - CHECK(inLength * channels == input.getWidth()); - CHECK(outLength * channels * num == height_ * width_); - real* tgtData = data_; - real* inData = input.getData(); - - for (size_t n = 0; n < num; ++n) { - if (!isContiguous()) { - tgtData = data_ + n * getStride(); - } - for (size_t c = 0; c < channels; ++c) { - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - tgtData[ph * outputW + pw] = 0; // clear - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - tgtData[ph * outputW + pw] += inData[h * imgSizeW + w]; - } - } - int poolSize = - excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX; - CHECK(poolSize); - tgtData[ph * outputW + pw] /= poolSize; - } - } - // compute offset - inData += inLength; - tgtData += outLength; - } - } -} - -void CpuMatrix::avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW, - bool excludeMode) { - size_t num = input.getHeight(); - size_t channels = input.getWidth() / outputH / outputW; - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - CHECK(inLength * channels == getWidth()); - real* inData = input.getData(); - real* outData = getData(); - - for (size_t n = 0; n < num; ++n) { - if (!input.isContiguous()) { - inData = input.getData() + n * input.getStride(); - } - for (size_t c = 0; c < channels; ++c) { - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - int poolSize = - excludeMode ? 
(hend - hstart) * (wend - wstart) : sizeY * sizeX; - CHECK(poolSize); - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - outData[h * imgSizeW + w] += inData[ph * outputW + pw] / poolSize; - } - } - } - } - // offset - outData += inLength; - inData += outLength; - } - } -} - -void CpuMatrix::maxPool3DForward(Matrix& inputMat, - Matrix& maxPoolIdx, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - real* inputData = inputMat.getData(); - real* outData = getData(); - real* maxPoolIdxData = maxPoolIdx.getData(); - size_t num = inputMat.getHeight(); - size_t inLength = imgSizeH * imgSizeW * imgSizeD; - size_t outLength = outputH * outputW * outputD; - CHECK(inLength == inputMat.getWidth() / channels); - CHECK_EQ(num, this->getHeight()); - CHECK_EQ(channels * outLength, this->getWidth()); - size_t outStride = getStride(); - - /* initialize the data_ */ - for (size_t i = 0; i < height_; i++) { - for (size_t j = 0; j < width_; j++) { - outData[(i)*outStride + j] = -(real)FLT_MAX; - maxPoolIdxData[(i)*outStride + j] = -1; - } - } - - /* pool max one by one */ - for (size_t n = 0; n < num; ++n) { // frame by frame - if (!isContiguous()) { - outData = getData() + n * outStride; - maxPoolIdxData = maxPoolIdx.getData() + n * outStride; - } - for (size_t c = 0; c < channels; ++c) { // channel by channel - for (size_t pd = 0; pd < outputD; ++pd) { - int dstart = pd * strideD - paddingD; - int dend = std::min(dstart + sizeZ, imgSizeD); - dstart = std::max(dstart, 0); - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - int maxIdx = -1; - real maxOutData = outData[(pd * outputH + ph) * outputW + pw]; - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (maxOutData < - inputData[(d * imgSizeH + h) * imgSizeW + w]) { - maxOutData = inputData[(d * imgSizeH + h) * imgSizeW + w]; - maxIdx = (d * imgSizeH + h) * imgSizeW + w; - } - } - } - } - outData[(pd * outputH + ph) * outputW + pw] = maxOutData; - maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx; - } - } - } - // compute offset - inputData += inLength; - outData += outLength; - maxPoolIdxData += outLength; - } - } -} - -void CpuMatrix::maxPool3DBackward(Matrix& outGrad, - Matrix& maxPoolIdx, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - size_t num = getHeight(); - size_t inLength = imgSizeH * imgSizeW * imgSizeD; - size_t outLength = outputH * outputW * outputD; - size_t channels = size_t(width_ / inLength); - CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() && - maxPoolIdx.getWidth() == outGrad.getWidth()); - - real* tgtGrad = getData(); - real* otGrad = outGrad.getData(); - real* maxPoolIdxData = maxPoolIdx.getData(); - size_t outStride = outGrad.getStride(); - - 
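// Unlike the 2D maxPoolBackward above, which re-derives the argmax by
// comparing values, the 3D path below scatters gradients through the indices
// recorded in maxPoolIdx during the forward pass:
//   tgtGrad[idx] = scaleTargets * tgtGrad[idx] + scaleOutput * otGrad[out]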
for (size_t n = 0; n < num; ++n) { - if (!outGrad.isContiguous()) { - otGrad = outGrad.getData() + n * outStride; - maxPoolIdxData = maxPoolIdx.getData() + n * outStride; - } - for (size_t c = 0; c < channels; ++c) { - for (size_t pd = 0; pd < outputD; ++pd) { - for (size_t ph = 0; ph < outputH; ++ph) { - for (size_t pw = 0; pw < outputW; ++pw) { - const size_t index = (pd * outputH + ph) * outputW + pw; - const size_t tgtIdx = static_cast(maxPoolIdxData[index]); - tgtGrad[tgtIdx] = - scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index]; - } - } - } - // offset - tgtGrad += inLength; - otGrad += outLength; - maxPoolIdxData += outLength; - } - } -} - -void CpuMatrix::avgPool3DForward(Matrix& input, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - // The main loop - size_t num = input.getHeight(); - size_t inLength = imgSizeH * imgSizeW * imgSizeD; - size_t outLength = outputH * outputW * outputD; - CHECK(inLength * channels == input.getWidth()); - CHECK(outLength * channels * num == height_ * width_); - real* tgtData = getData(); - real* inData = input.getData(); - - for (size_t n = 0; n < num; ++n) { - if (!isContiguous()) { - tgtData = data_ + n * getStride(); - } - for (size_t c = 0; c < channels; ++c) { - for (size_t pd = 0; pd < outputD; ++pd) { - int dstart = pd * strideD - paddingD; - int dend = std::min(dstart + sizeZ, imgSizeD); - dstart = std::max(dstart, 0); - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - - tgtData[(pd * outputH + ph) * outputW + pw] = 0; // clear - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - tgtData[(pd * outputH + ph) * outputW + pw] += - inData[(d * imgSizeH + h) * imgSizeW + w]; - } - } - } - int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); - CHECK(poolSize); - tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize; - } - } - } - // compute offset - inData += inLength; - tgtData += outLength; - } - } -} - -void CpuMatrix::avgPool3DBackward(Matrix& input, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - size_t num = input.getHeight(); - size_t inLength = imgSizeH * imgSizeW * imgSizeD; - size_t outLength = outputH * outputW * outputD; - size_t channels = input.getWidth() / outLength; - CHECK(inLength * channels == getWidth()); - real* inData = input.getData(); - real* outData = getData(); - - for (size_t n = 0; n < num; ++n) { - if (!input.isContiguous()) { - inData = input.getData() + n * input.getStride(); - } - for (size_t c = 0; c < channels; ++c) { - for (size_t pd = 0; pd < outputD; ++pd) { - int dstart = pd * strideD - paddingD; - int dend = std::min(dstart + sizeZ, imgSizeD); - dstart = std::max(dstart, 0); - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * 
strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); - CHECK(poolSize); - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - outData[(d * imgSizeH + h) * imgSizeW + w] += - inData[(pd * outputH + ph) * outputW + pw] / poolSize; - } - } - } - } - } - } - // offset - outData += inLength; - inData += outLength; - } - } -} - -/** - * Input: one or more sequences. Each sequence contains some instances. - * Output: output size is the number of input sequences (NOT input instances). - * output[i] is set to max_{for each instance in this sequence}{input[i]} - */ -void CpuMatrix::maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index) { - CHECK(dynamic_cast(&input)); - CHECK(dynamic_cast(&sequence)); - CHECK(dynamic_cast(&index)); - - real* outData = getData(); - real* inputData = input.getData(); - const int* starts = sequence.getData(); - int* maxIndex = index.getData(); - size_t numSequences = getHeight(); - size_t dim = getWidth(); - - CHECK_EQ(dim, input.getWidth()); - CHECK_EQ(numSequences, sequence.getSize() - 1); - CHECK_EQ(starts[numSequences], (int)input.getHeight()); - CHECK_EQ(numSequences * dim, index.getSize()); - - for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { - // current sequence, loop for each input instance - // (1) first instance: do not need compare, copy value to outV directly - for (size_t k = 0; k < dim; ++k) { - outData[sequenceId * dim + k] = inputData[starts[sequenceId] * dim + k]; - maxIndex[sequenceId * dim + k] = starts[sequenceId]; - } - // (2) other instance in same sequence - for (int insId = starts[sequenceId] + 1; insId < starts[sequenceId + 1]; - ++insId) { - // insId is the index on all instances - for (size_t k = 0; k < dim; ++k) { - // for each dim - if (inputData[insId * dim + k] > outData[sequenceId * dim + k]) { - // update max value and record index - outData[sequenceId * dim + k] = inputData[insId * dim + k]; - maxIndex[sequenceId * dim + k] = insId; - } - } - } - } -} - -void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index) { - CHECK(dynamic_cast(&outputGrad)); - CHECK(dynamic_cast(&sequence)); - CHECK(dynamic_cast(&index)); - - real* inputGrad = getData(); - real* outGrad = outputGrad.getData(); - int* maxIndex = index.getData(); - size_t dim = getWidth(); - size_t numSequences = sequence.getSize() - 1; - - CHECK_EQ(dim, outputGrad.getWidth()); - CHECK_EQ(numSequences, outputGrad.getHeight()); - CHECK_EQ(numSequences * dim, index.getSize()); - - for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { - // current sequence - for (size_t j = 0; j < dim; ++j) { - // each dim - int insId = maxIndex[sequenceId * dim + j]; - inputGrad[insId * dim + j] += outGrad[sequenceId * dim + j]; - } - } -} - -inline void vecAddTo(real* a, const real* b, size_t len) { - for (unsigned int i = 0; i < len; ++i) { - a[i] += b[i]; - } -} - -inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { - for (unsigned int i = 0; i < len; ++i) { - a[i] += scaleB * b[i]; - } -} - -inline void colVecAddTo( - real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) { - for (unsigned int i = 0; i < len; ++i) { 
- a[i * aWidth] += b[i * bWidth]; - } -} - -inline void colVecAddTo( - real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) { - for (unsigned int i = 0; i < len; ++i) { - a[i * aWidth] += b[i * bWidth] * c; - } -} - -void CpuMatrix::addBias(Matrix& b, real scale) { - CHECK(b.useGpu_ == false) << "Matrix type are not equal"; - - CHECK_EQ(b.getHeight(), (size_t)1); - CHECK_EQ(width_, b.getWidth()); - real* aData = getData(); - real* bData = b.getData(); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - - if (scale == 1 && getStride() % 32 == 0) { // use libaddto - // @TODO(yuyang18) Make input addr can be unaligned. - // So merge this if and else - CHECK_EQ((size_t)aData % 32, 0UL); - CHECK_EQ((size_t)bData % 32, 0UL); - for (size_t i = 0; i < numSamples; i++) { - simd::addTo(aData + i * getStride(), bData, dim); - } - } else { - for (size_t i = 0; i < numSamples; i++) { - for (size_t j = 0; j < dim; j++) { - aData[i * getStride() + j] += scale * bData[j]; - } - } - } -} - -void CpuMatrix::addSharedBias(Matrix& b, real scale) { - CHECK_EQ(b.getHeight(), (size_t)1); - real* aData = getData(); - real* bData = b.getData(); - size_t numSamples = getHeight(); - size_t channel = b.getWidth(); - CHECK_EQ(getWidth() % channel, 0UL); - size_t dim = getWidth() / channel; - - for (size_t i = 0; i < numSamples; i++) { - for (size_t c = 0; c < channel; c++) { - for (size_t j = 0; j < dim; j++) { - aData[i * getStride() + c * dim + j] += scale * bData[c]; - } - } - } -} - -void CpuMatrix::collectBias(Matrix& a, real scale) { - CHECK_EQ(getHeight(), (size_t)1); - CHECK_EQ(width_, a.getWidth()); - CpuSparseMatrix* aptr = dynamic_cast(&a); - if (!aptr) { - sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); - } else { - size_t nnz = aptr->getElementCnt(); - int* cols = aptr->getCols(); - real* A = aptr->getValue(); - real* B = getData(); - for (size_t i = 0; i < nnz; i++) { - B[cols[i]] += scale * A[i]; - } - } -} - -void CpuMatrix::collectSharedBias(Matrix& a, real scale) { - CHECK_EQ(getHeight(), (size_t)1); - real* B = getData(); - real* A = a.getData(); - size_t numSamples = a.getHeight(); - size_t channel = getWidth(); - CHECK_EQ(a.getWidth() % channel, 0UL); - size_t dim = a.getWidth() / channel; - for (size_t i = 0; i < numSamples; i++) { - for (size_t c = 0; c < channel; c++) { - for (size_t j = 0; j < dim; j++) { - B[c] += scale * A[i * channel * dim + c * dim + j]; - } - } - } -} - -void CpuMatrix::sequenceAvgForward(Matrix& a, - const IVector& startsPos, - int mode) { - size_t height = getHeight(); - size_t width = getWidth(); - CHECK_EQ(height, startsPos.getSize() - 1); - CHECK_EQ(width, a.getWidth()); - real* dst = getData(); - real* src = a.getData(); - const int* starts = startsPos.getData(); - MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); - MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); - for (size_t i = 0; i < height; i++) { - int sequenceLength = starts[i + 1] - starts[i]; - if (0 == sequenceLength) { - // empty sequence - continue; - } - outMtx->setData(dst + i * width); - dataMtx->setData(src + starts[i] * width, sequenceLength, width); - if (mode == 0) { - // plain average - outMtx->sumCols(*dataMtx, - (real)1 / (real)sequenceLength, - /* scaleDest= */ 1); - } else if (mode == 1) { - // sum instead of average - outMtx->sumCols(*dataMtx, /* scaleSum= */ 1, /* scaleDest= */ 1); - } else if (mode == 2) { - // divide by square root of sequenceLength - outMtx->sumCols(*dataMtx, - (real)1 / std::sqrt(sequenceLength), - 
/* scaleDest= */ 1); - } else { - LOG(FATAL) << "should not reach here"; - } - } -} - -void CpuMatrix::sequenceAvgBackward(Matrix& a, - const IVector& startsPos, - int mode) { - size_t height = a.getHeight(); - size_t width = getWidth(); - CHECK_EQ(height, startsPos.getSize() - 1); - CHECK_EQ(width, a.getWidth()); - real* dst = getData(); - real* src = a.getData(); - const int* starts = startsPos.getData(); - MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); - MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); - for (size_t i = 0; i < height; ++i) { - int sequenceLength = starts[i + 1] - starts[i]; - if (0 == sequenceLength) { - // empty sequence - continue; - } - outMtx->setData(dst + starts[i] * width, sequenceLength, width); - dataMtx->setData(src + i * width); - if (mode == 0) { - // plain average - outMtx->addBias(*dataMtx, 1.0f / sequenceLength); - } else if (mode == 1) { - // sum instead of average - outMtx->addBias(*dataMtx, 1.0f); - } else if (mode == 2) { - // divide by square root of sequenceLength - outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength)); - } else { - LOG(FATAL) << "should not reach here"; - } - } -} - -/* this = scaleAB*(a*b) + scaleT*this*/ -void CpuMatrix::mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - CHECK(!isTransposed()) << "Not supported"; - const auto a_ptr = dynamic_cast(&a); - const auto b_ptr = dynamic_cast(&b); - const auto a_ptr_s = dynamic_cast(&a); - const auto b_ptr_s = dynamic_cast(&b); - - if (a_ptr && b_ptr) { - mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, scaleAB, scaleT); - } else if (a_ptr_s && b_ptr) { - mul((CpuSparseMatrix*)a_ptr_s, (CpuMatrix*)b_ptr, scaleAB, scaleT); - } else if (a_ptr && b_ptr_s) { - mul((CpuMatrix*)a_ptr, (CpuSparseMatrix*)b_ptr_s, scaleAB, scaleT); - } else { - LOG(FATAL) << "Not supported"; - } -} - -void CpuMatrix::mul(CpuSparseMatrix* a, - CpuMatrix* b, - real scaleAB, - real scaleT) { - if (dynamic_cast(b)) { - return mul(a, dynamic_cast(b), this, scaleAB, scaleT); - } else if (dynamic_cast(b)) { - return mul(a, dynamic_cast(b), this, scaleAB, scaleT); - } else { - return mul(a, b, this, scaleAB, scaleT); - } -} - -void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { - CHECK(!isTransposed()) << "Not supported"; - - size_t a_col, b_col, a_row, b_row; - bool a_trans, b_trans; - if (!a->isTransposed()) { - a_col = a->getWidth(); - a_row = a->getHeight(); - a_trans = false; - } else { - a_col = a->getHeight(); - a_row = a->getWidth(); - a_trans = true; - } - if (!b->isTransposed()) { - b_col = b->getWidth(); - b_row = b->getHeight(); - b_trans = false; - } else { - b_col = b->getHeight(); - b_row = b->getWidth(); - b_trans = true; - } - - CHECK_EQ(a_col, b_row); - CHECK_EQ(a_row, getHeight()); - CHECK_EQ(b_col, getWidth()); - - real* A = a->getData(); - real* B = b->getData(); - real* C = getData(); - - int M = getHeight(); - int N = getWidth(); - int K = a_col; - int lda = a->getStride(); - int ldb = b->getStride(); - int ldc = getStride(); - BlasGemm::compute( - a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); -} - -void CpuMatrix::mul( - CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, real scaleT) { - CHECK(!c->isTransposed()) << "Not supported"; - CHECK_EQ(c->getValueType(), FLOAT_VALUE); - - real* A = a->getData(); - real* B = b->getData(); - real* C = c->getValue(); - int* rows = c->getRows(); - int* cols = c->getCols(); - size_t height = c->getHeight(); - size_t width = c->getWidth(); - 
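What follows computes a sampled dense-dense product: only entries already stored in the sparse output c are updated, i.e. C[r][s] = scaleAB * dot(A[r,:], B[:,s]) + scaleT * C[r][s] restricted to c's pattern (SDDMM in today's terminology). A compact CSR reference of the non-transposed branch (illustrative names, not from this diff):

#include <cstddef>

static void sddmmCsrRef(const float* A, const float* B, float* vals,
                        const int* rowPtr, const int* cols, size_t height,
                        size_t k, size_t width, float scaleAB, float scaleT) {
  for (size_t r = 0; r < height; ++r)
    for (int j = rowPtr[r]; j < rowPtr[r + 1]; ++j) {
      float dot = 0;  // dot(A row r, B column cols[j])
      for (size_t t = 0; t < k; ++t) dot += A[r * k + t] * B[t * width + cols[j]];
      vals[j] = scaleAB * dot + scaleT * vals[j];
    }
}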
if (scaleT == 0) { - c->zeroMem(); - } - - if (!a->isTransposed() && !b->isTransposed()) { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getHeight(), height); - CHECK_EQ(b->getWidth(), width); - if (c->getFormat() == SPARSE_CSC) { - for (size_t i = 0; i < width; i++) { - size_t start = c->getColStartIdx(i); - size_t end = c->getColStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t rowIdx = rows[j]; - for (size_t k = 0; k < m; k++) { - sum += A[rowIdx * m + k] * B[k * width + i]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - } else { - for (size_t i = 0; i < height; i++) { - size_t start = c->getRowStartIdx(i); - size_t end = c->getRowStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t colIdx = cols[j]; - for (size_t k = 0; k < m; k++) { - sum += A[i * m + k] * B[k * width + colIdx]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - } - } else if (a->isTransposed() && !b->isTransposed()) { - size_t m = a->getHeight(); - CHECK_EQ(m, b->getHeight()); - CHECK_EQ(b->getWidth(), width); - CHECK_EQ(a->getWidth(), height); - - if (c->getFormat() == SPARSE_CSC) { - for (size_t i = 0; i < width; i++) { - size_t start = c->getColStartIdx(i); - size_t end = c->getColStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t rowIdx = rows[j]; - for (size_t k = 0; k < m; k++) { - sum += A[k * height + rowIdx] * B[k * width + i]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - } else { - for (size_t i = 0; i < height; i++) { - int start = c->getRowStartIdx(i); - int end = c->getRowStartIdx(i + 1); - for (int j = start; j < end; j++) { - real sum = 0; - size_t colIdx = cols[j]; - for (size_t k = 0; k < m; k++) { - sum += A[k * height + i] * B[k * width + colIdx]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - } - } else if (!a->isTransposed() && b->isTransposed()) { - size_t m = a->getWidth(); - CHECK_EQ(b->getWidth(), m); - CHECK_EQ(a->getHeight(), height); - CHECK_EQ(b->getHeight(), width); - if (c->getFormat() == SPARSE_CSR) { - for (size_t i = 0; i < height; i++) { - size_t start = c->getRowStartIdx(i); - size_t end = c->getRowStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t colIdx = cols[j]; - for (size_t k = 0; k < m; k++) { - sum += A[i * m + k] * B[colIdx * m + k]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - } else { - LOG(FATAL) << "Not supported csc format " - "when a is not trans and b is trans"; - } - } else { - LOG(FATAL) << "Not supported"; - } -} - -void CpuMatrix::mul(CpuMatrix* a, - CpuSparseMatrix* b, - real scaleAB, - real scaleT) { - CHECK(!trans_) << "Not supported"; - CHECK(!a->isTransposed()) << "Not supported"; - CHECK(scaleT == 0 || scaleT == 1); - - // TODO(yuyang18): Maybe bug implementation here - CHECK_EQ(scaleAB, static_cast(1.0)); - - real* A = a->getData(); - real* B = b->getValue(); - real* C = getData(); - int* rows = b->getRows(); - int* cols = b->getCols(); - - if (scaleT == 0) { - zeroMem(); - } - if (b->getFormat() == SPARSE_CSC) { - if (!b->isTransposed()) { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getHeight(), height_); - CHECK_EQ(b->getWidth(), width_); - - if (b->getValueType() == NO_VALUE) { - for (size_t j = 0; j < b->getWidth(); ++j) { - int start = b->getColStartIdx(j); - int end = b->getColStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo(C + j, A + rows[i], height_, width_, a->getWidth()); - } - } - } else if (b->getValueType() 
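The `mul(CpuMatrix*, CpuMatrix*, CpuSparseMatrix*, ...)` overload above is a sampled GEMM: the dense dot product is evaluated only at positions already present in `c`'s sparsity pattern, never at the zeros. A self-contained sketch of the CSR, non-transposed case under the same row-major conventions (all names hypothetical):

```cpp
#include <cstddef>
#include <vector>

// vals[j] = scaleAB * dot(A.row(i), B.col(cols[j])) + scaleT * vals[j],
// evaluated only at the nonzeros stored in the CSR arrays rowPtr/cols/vals.
void sampledGemmCsr(const std::vector<float>& A, size_t m,      // height x m
                    const std::vector<float>& B, size_t width,  // m x width
                    const std::vector<int>& rowPtr,             // height + 1
                    const std::vector<int>& cols,               // nnz
                    std::vector<float>& vals,                   // nnz
                    float scaleAB, float scaleT) {
  size_t height = rowPtr.size() - 1;
  for (size_t i = 0; i < height; ++i) {
    for (int j = rowPtr[i]; j < rowPtr[i + 1]; ++j) {
      float sum = 0;
      for (size_t k = 0; k < m; ++k) {
        sum += A[i * m + k] * B[k * width + cols[j]];
      }
      vals[j] = scaleAB * sum + scaleT * vals[j];
    }
  }
}
```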
== FLOAT_VALUE) { - for (size_t j = 0; j < b->getWidth(); ++j) { - int start = b->getColStartIdx(j); - int end = b->getColStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo( - C + j, A + rows[i], B[i], height_, width_, a->getWidth()); - } - } - } - } else /*if (b->isTransposed())*/ { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), width_); - CHECK_EQ(a->getHeight(), height_); - CHECK_EQ(b->getWidth(), m); - if (b->getValueType() == NO_VALUE) { - for (size_t i = 0; i < b->getWidth(); ++i) { - int start = b->getColStartIdx(i); - int end = b->getColStartIdx(i + 1); - for (int j = start; j < end; ++j) { - colVecAddTo(C + rows[j], A + i, height_, width_, a->getWidth()); - } - } - } else if (b->getValueType() == FLOAT_VALUE) { - for (size_t i = 0; i < b->getWidth(); ++i) { - int start = b->getColStartIdx(i); - int end = b->getColStartIdx(i + 1); - for (int j = start; j < end; ++j) { - colVecAddTo( - C + rows[j], A + i, B[j], height_, width_, a->getWidth()); - } - } - } - } - } else { - if (!b->isTransposed()) { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getHeight(), height_); - CHECK_EQ(b->getWidth(), width_); - - if (b->getValueType() == NO_VALUE) { - for (size_t j = 0; j < b->getHeight(); ++j) { - int start = b->getRowStartIdx(j); - int end = b->getRowStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo(C + cols[i], A + j, height_, width_, a->getWidth()); - } - } - } else if (b->getValueType() == FLOAT_VALUE) { - for (size_t j = 0; j < b->getHeight(); ++j) { - int start = b->getRowStartIdx(j); - int end = b->getRowStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo( - C + cols[i], A + j, B[i], height_, width_, a->getWidth()); - } - } - } - } else /*if (b->isTransposed())*/ { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), width_); - CHECK_EQ(a->getHeight(), height_); - CHECK_EQ(b->getWidth(), m); - if (b->getValueType() == NO_VALUE) { - for (size_t i = 0; i < b->getHeight(); ++i) { - int start = b->getRowStartIdx(i); - int end = b->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - colVecAddTo(C + i, A + cols[j], height_, width_, a->getWidth()); - } - } - } else if (b->getValueType() == FLOAT_VALUE) { - for (size_t i = 0; i < b->getHeight(); ++i) { - int start = b->getRowStartIdx(i); - int end = b->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - colVecAddTo( - C + i, A + cols[j], B[j], height_, width_, a->getWidth()); - } - } - } - } - } -} - -void CpuMatrix::selectRows(Matrix& table, IVector& ids) { - if (dynamic_cast(&table)) { - selectRowsImp(*dynamic_cast(&table), ids); - } else if (dynamic_cast(&table)) { - selectRowsImp(*dynamic_cast(&table), ids); - } else { - CHECK(table.isContiguous()); - selectRowsImp(*dynamic_cast(&table), ids); - } -} - -void CpuMatrix::selectElements(Matrix& table, IVector& ids) { - CHECK_EQ(table.getHeight(), ids.getSize()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), 1U); - real* tableData = table.getData(); - int* idsData = ids.getData(); - for (size_t i = 0; i < table.getHeight(); i++) { - data_[i] += tableData[i * table.getWidth() + idsData[i]]; - } -} - -void CpuMatrix::addElements(Matrix& table, IVector& ids) { - CHECK_EQ(table.getHeight(), ids.getSize()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), 1U); - real* tableData = table.getData(); - int* idsData = ids.getData(); - for (size_t i = 0; i < table.getHeight(); i++) { - tableData[i * table.getWidth() + idsData[i]] += data_[i]; - } -} - -// 
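`colVecAddTo` itself is not shown in this hunk; from its call sites above (`colVecAddTo(C + j, A + rows[i], B[i], height_, width_, a->getWidth())`) it appears to be a strided column accumulate, i.e. each stored nonzero B(r, j) = v contributes v * A(:, r) into C(:, j). A sketch under that assumption (the no-value call sites would simply use v = 1):

```cpp
#include <cstddef>

// Assumed semantics: c and a point at the start of one column inside
// row-major matrices, so each column element is one `width` stride apart.
// c[r] += v * a[r] for every row r, walking both columns with strides.
void colVecAddTo(float* c, const float* a, float v, size_t rows,
                 size_t cWidth, size_t aWidth) {
  for (size_t r = 0; r < rows; ++r) {
    c[r * cWidth] += v * a[r * aWidth];
  }
}
```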
this.row[i] += table.row[ids[i]] -template -void CpuMatrix::selectRowsImp(TableMatType& table, IVector& ids) { - CHECK(!table.useGpu()); - CHECK(!ids.useGpu()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), table.getWidth()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - real* a = getData(); - size_t tableSize = table.getHeight(); - int* index = ids.getData(); - - for (size_t i = 0; i < numSamples; ++i) { - if (index[i] == -1) continue; - CHECK_LT(index[i], (int)tableSize); - CHECK_GE(index[i], 0); - vecAddTo(a + i * stride_, table.getRow(index[i]), dim); - } -} - -void CpuMatrix::addToRows(Matrix& table, IVector& ids) { - if (dynamic_cast(&table)) { - addToRowsImp(*dynamic_cast(&table), ids); - } else if (dynamic_cast(&table)) { - addToRowsImp(*dynamic_cast(&table), ids); - } else if (dynamic_cast(&table)) { - addToRowsImp(*dynamic_cast(&table), ids); - } else { - CHECK(table.isContiguous()); - addToRowsImp(*dynamic_cast(&table), ids); - } -} - -// table.row[ids[i]] += this.row[i] -template -void CpuMatrix::addToRowsImp(TableMatType& table, IVector& ids) { - CHECK(!table.useGpu()); - CHECK(!ids.useGpu()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), table.getWidth()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - real* a = getData(); - size_t tableSize = table.getHeight(); - int* index = ids.getData(); - - for (size_t i = 0; i < numSamples; ++i) { - if (index[i] == -1) continue; - CHECK_LT(index[i], (int)tableSize); - CHECK_GE(index[i], 0); - vecAddTo(table.getRow(index[i]), a + i * stride_, dim); - } -} - -static ThreadLocal> threadLocalColArray; - -template -void CpuMatrix::mul( - CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT) { - CHECK(!c->isTransposed()) << "Not supported"; - CHECK(!b->isTransposed()) << "Not supported"; - // TODO(yuyang18): Maybe bug implementation here. - CHECK(scaleAB == 1) << "Not supported"; - CHECK(scaleT == 0 || scaleT == 1) << "Not supported"; - CHECK_EQ(a->getFormat(), SPARSE_CSR) << "Not supported"; - - real* B = b->getData(); - real* C = c->getData(); - size_t height = c->getHeight(); - size_t width = c->getWidth(); - int* cols = a->getCols(); - real* values = a->getValue(); - - if (scaleT == 0) { - c->zeroMem(); - } - - if (!a->isTransposed()) { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getHeight(), height); - CHECK_EQ(b->getWidth(), width); - - if (a->getValueType() == NO_VALUE) { - if (width % 32 == 0) { // use libaddto - // @TODO(yuyang18) Make input addr can be unaligned. 
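`selectRowsImp` and `addToRowsImp` above are a gather-add and its adjoint scatter-add over a row-index vector, with `-1` as a skip marker (the deleted code additionally bounds-checks each index against the table height). A toy standalone pair, assuming contiguous row-major storage:

```cpp
#include <cstddef>
#include <vector>

// dst.row[i] += table.row[ids[i]]   (gather-add); ids[i] == -1 skips row i.
void gatherAddRows(std::vector<float>& dst, const std::vector<float>& table,
                   const std::vector<int>& ids, size_t dim) {
  for (size_t i = 0; i < ids.size(); ++i) {
    if (ids[i] == -1) continue;
    for (size_t d = 0; d < dim; ++d)
      dst[i * dim + d] += table[ids[i] * dim + d];
  }
}

// table.row[ids[i]] += src.row[i]   (scatter-add), the backward of gather.
void scatterAddRows(std::vector<float>& table, const std::vector<float>& src,
                    const std::vector<int>& ids, size_t dim) {
  for (size_t i = 0; i < ids.size(); ++i) {
    if (ids[i] == -1) continue;
    for (size_t d = 0; d < dim; ++d)
      table[ids[i] * dim + d] += src[i * dim + d];
  }
}
```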
- // So merge this if and else - CHECK_EQ((size_t)B % 32, 0UL); - CHECK_EQ((size_t)C % 32, 0UL); - auto& colArray = *threadLocalColArray; - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - size_t colNum = end - start; - colArray.resize(colNum); - for (int j = 0; j < end - start; ++j) { - colArray[j] = b->getRow(cols[j + start]); - } - simd::batchAddTo(c->getRow(i), &colArray[0], colNum, width); - } - - } else { - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - vecAddTo(c->getRow(i), b->getRow(cols[j]), width); - } - } - } - } else if (a->getValueType() == FLOAT_VALUE) { - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - vecAddTo(c->getRow(i), b->getRow(cols[j]), values[j], width); - } - } - } - } else /*if (a->isTransposed())*/ { - size_t m = a->getHeight(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getWidth(), height); - CHECK_EQ(b->getWidth(), width); - if (a->getValueType() == NO_VALUE) { - if (width % 32 == 0) { // use libaddto - // @TODO(yuyang18) Make input addr can be unaligned. - // So merge this if and else - CHECK_EQ((size_t)B % 32, 0UL); - CHECK_EQ((size_t)C % 32, 0UL); - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - simd::addTo(c->getRow(cols[j]), b->getRow(i), width); - } - } - - } else { - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - vecAddTo(c->getRow(cols[j]), b->getRow(i), width); - } - } - } - } else if (a->getValueType() == FLOAT_VALUE) { - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - vecAddTo(c->getRow(cols[j]), b->getRow(i), values[j], width); - } - } - } - } -} - -// instantiation mul() called in SparseRowMatrix.cpp -template void CpuMatrix::mul( - CpuSparseMatrix* a, - CpuMatrix* b, - SparseRowCpuMatrix* c, - real scaleAB, - real scaleT); -template void CpuMatrix::mul( - CpuSparseMatrix* a, - CpuMatrix* b, - SparseAutoGrowRowCpuMatrix* c, - real scaleAB, - real scaleT); -template void CpuMatrix::mul(CpuSparseMatrix* a, - CpuMatrix* b, - CacheRowCpuMatrix* c, - real scaleAB, - real scaleT); - -#ifndef PADDLE_MOBILE_INFERENCE -void SharedCpuMatrix::mul(CpuSparseMatrix* a, - CpuMatrix* b, - real scaleAB, - real scaleT) { - CHECK(!isTransposed()) << "Not supported"; - CHECK(!b->isTransposed()) << "Not supported"; - CHECK_EQ(scaleAB, 1) << "Not supported"; - CHECK_EQ(scaleT, 1) << "Not supported"; - CHECK_EQ(a->getFormat(), SPARSE_CSR) << "not supported"; - - real* B = b->getData(); - real* C = getData(); - size_t height = getHeight(); - size_t width = getWidth(); - - // get real trans - MatrixPtr aTrans; - if (a->isTransposed()) { - aTrans = a->getTmpSparseMatrix(a->getWidth(), a->getHeight()); - a->transpose(aTrans, false); - } - a = dynamic_cast(aTrans.get()); - - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getHeight(), height); - CHECK_EQ(b->getWidth(), width); - - size_t blockSize = (height / blockNum_) + 1; - CpuMatrixPtr localBuf = 
*localBuf_; - if (!localBuf) { - localBuf = std::make_shared(blockSize, width); - } else { - localBuf->resize(blockSize, width); - } - localBuf->zeroMem(); - real* localC = localBuf->getData(); - std::vector& blockSeq = *blockSeq_; - if (blockSeq.size() == 0) { - for (int k = 0; k < blockNum_; ++k) { - blockSeq.push_back(k); - } - std::shuffle( - blockSeq.begin(), blockSeq.end(), ThreadLocalRandomEngine::get()); - } - std::vector& localBufRows = *localBufRows_; - int* cols = a->getCols(); - real* value = a->getValue(); - - for (int k = 0; k < blockNum_; ++k) { - int blockId = blockSeq[k]; - size_t blockBegin = blockId * blockSize; - size_t blockEnd = (blockId + 1) * blockSize; - if (blockId == blockNum_ - 1) { - blockEnd = height; - } - if (a->getValueType() == NO_VALUE) { - for (size_t i = blockBegin; i < blockEnd; ++i) { - int start = a->getRowStartIdx(i); - int end = a->getRowStartIdx(i); - size_t colNum = a->getColNum(i); - if (colNum == 0) { - continue; - } // skip empty row - localBufRows.push_back(i); - size_t bufPos = localBufRows.size() - 1; - for (int j = start; j < end; ++j) { - vecAddTo(localC + bufPos * width, B + cols[j] * width, width); - } - } - } else if (a->getValueType() == FLOAT_VALUE) { - for (size_t i = blockBegin; i < blockEnd; ++i) { - int start = a->getRowStartIdx(i); - int end = a->getRowStartIdx(i); - size_t colNum = a->getColNum(i); - if (colNum == 0) { - continue; - } // skip empty row - localBufRows.push_back(i); - size_t bufPos = localBufRows.size() - 1; - for (int j = start; j < end; ++j) { - vecAddTo( - localC + bufPos * width, B + cols[j] * width, value[j], width); - } - } - } - - { - std::lock_guard guard(*blockLocks_[blockId]); - for (size_t i = 0; i < localBufRows.size(); ++i) { - vecAddTo(C + localBufRows[i] * width, localC + i * width, width); - } - } - memset(localC, 0, localBufRows.size() * width * sizeof(real)); - localBufRows.clear(); - } - - VLOG(2) << " B[0]=" << B[0] << " B[1]=" << B[1] << " C[0]=" << C[0] - << " C[1]=" << C[1]; -} - -void SharedCpuMatrix::add(Matrix& b, real p1, real p2) { - CHECK_EQ(blockNum_, 1); - std::lock_guard guard(*blockLocks_[0]); - CpuMatrix::add(b, p1, p2); -} - -void SharedCpuMatrix::add(real p1, real p2) { - CHECK_EQ(blockNum_, 1); - std::lock_guard guard(*blockLocks_[0]); - CpuMatrix::add(p1, p2); -} - -void SharedCpuMatrix::initShared(int blockNum) { - CHECK_GT(height_ * width_, 1UL * 1024 * 1024) - << "should not share small matrix"; - initBlock(blockNum); -} - -void SharedCpuMatrix::initBlock(int blockNum) { - CHECK_LE(blockNum, 200) << "should not use large block number"; - blockNum_ = blockNum; - blockLocks_.resize(blockNum); - for (auto& locker : blockLocks_) { - locker.reset(new std::mutex); - } -} - -#endif -/* Add a (column) vector b to matrix a, column by column */ -void CpuMatrix::addColumnVector(const Matrix& b) { - BaseMatrix::addColVector(const_cast(b)); -} - -/* this = a*b */ -void CpuMatrix::mul(const Matrix& a, const Matrix& b) { - return mul(a, b, 1.0, 0.0); -} - -/* this = scaleAB*(this*b) + scaleT*this */ -void CpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) { - (void)b; - (void)scaleAB; - (void)scaleT; - LOG(FATAL) << "Not implemented"; -} - -/* this = this* b */ -void CpuMatrix::rightMul(Matrix& b) { return rightMul(b, 1.0, 0.0); } - -/* this = scaleAB*(a*this) + scaleT*this */ -void CpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) { - (void)a; - (void)scaleAB; - (void)scaleT; - LOG(FATAL) << "Not implemented"; -} - -/* this = a*this) */ -void CpuMatrix::leftMul(Matrix& a) 
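`SharedCpuMatrix::mul` above limits lock contention by accumulating each block's rows into a thread-local buffer and visiting blocks in a shuffled order, so the per-block mutex is held only for the short final copy rather than the whole computation. A sketch of just that merge step (names illustrative, not the deleted members):

```cpp
#include <cstddef>
#include <mutex>
#include <vector>

// Merge a thread-local partial result into the shared output under the
// block's lock; localRows[i] records which global row localC's row i holds.
void mergeBlock(std::vector<float>& C, const std::vector<float>& localC,
                const std::vector<int>& localRows, size_t width,
                std::mutex& blockLock) {
  std::lock_guard<std::mutex> guard(blockLock);
  for (size_t i = 0; i < localRows.size(); ++i)
    for (size_t d = 0; d < width; ++d)
      C[localRows[i] * width + d] += localC[i * width + d];
}
```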
{ return leftMul(a, 1.0, 0.0); } - -void CpuMatrix::colMerge(Matrix& src) { src.rowSum(*this); } - -void CpuMatrix::rowSum(Matrix& sum) { - CHECK_EQ(sum.getHeight(), getHeight()); - CHECK_EQ(sum.getWidth(), (size_t)1); - - sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); -} - -void CpuMatrix::rowMaxId(IVector& maxIds) { - CHECK(!maxIds.useGpu()) << "Matrix type are not equal"; - - size_t numSamples = getHeight(); - CHECK_EQ(maxIds.getSize(), numSamples); - - real* a = getData(); - int* s = maxIds.getData(); - size_t dim = getWidth(); - - for (size_t i = 0; i < numSamples; i++) { - real sm = a[i * dim]; - int maxId = 0; - for (size_t j = 1; j < dim; j++) { - if (a[i * dim + j] > sm) { - maxId = j; - sm = a[i * dim + j]; - } - } - s[i] = maxId; - } -} - -void CpuMatrix::rowMax(Matrix& max) { - CHECK_EQ(max.getHeight(), getHeight()); - CHECK_EQ(max.getWidth(), (size_t)1); - max.maxRows(*this); -} - -/* Get the top k elements of each row of this matrix */ -void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { - CHECK(isContiguous()); - CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal"; - size_t numSamples = getHeight(); - size_t beam = maxVal.getWidth(); - CHECK_EQ(maxIds.getSize(), numSamples * beam); - CHECK_EQ(maxVal.getHeight(), numSamples); - CHECK_EQ(maxVal.getWidth(), beam); - - real* a = getData(); - int* s = maxIds.getData(); - real* t = maxVal.getData(); - size_t dim = getWidth(); - for (size_t i = 0; i < numSamples; i++) { - std::vector> vec; - for (size_t j = 0; j < dim; j++) { - vec.push_back(std::pair(a[i * dim + j], j)); - } - - std::partial_sort( - vec.begin(), - vec.begin() + beam, - vec.end(), - [](const std::pair& l, const std::pair& r) { - return l.first > r.first; - }); - for (size_t j = 0; j < beam; j++) { - t[i * beam + j] = vec[j].first; - s[i * beam + j] = vec[j].second; - } - } -} - -void CpuMatrix::colMax(Matrix& max) { - CHECK_EQ(max.getWidth(), getWidth()); - CHECK_EQ(max.getHeight(), (size_t)1); - max.maxCols(*this); -} - -void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { - CHECK(isContiguous()); - CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal"; - size_t numSamples = getWidth(); - size_t beam = maxVal.getHeight(); - CHECK_EQ(maxIds.getSize(), numSamples * beam); - CHECK_EQ(maxVal.getWidth(), numSamples); - - real* a = getData(); - int* s = maxIds.getData(); - real* t = maxVal.getData(); - size_t dim = getHeight(); - for (size_t i = 0; i < numSamples; i++) { - std::vector> vec; - for (size_t j = 0; j < dim; j++) { - vec.push_back(std::pair(a[i + j * numSamples], j)); - } - - std::partial_sort( - vec.begin(), - vec.begin() + beam, - vec.end(), - [](const std::pair& l, const std::pair& r) { - return l.first > r.first; - }); - for (size_t j = 0; j < beam; j++) { - t[i + j * numSamples] = vec[j].first; - s[i + j * numSamples] = vec[j].second; - } - } -} - -void CpuMatrix::maxoutForward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - CHECK(dynamic_cast(&a)); - CHECK(dynamic_cast(&id)); - CHECK_EQ(a.getHeight(), getHeight()); - - size_t size = getWidth(); - size_t batchSize = getHeight(); - size_t featLen = size / channels; - const real* input = a.getData(); - int* idForCpu = id.getData(); - - MatrixPtr maxInMat, maxOutMat; - Matrix::resizeOrCreate(maxInMat, groups, size, false, false); - Matrix::resizeOrCreate(maxOutMat, 1, size, false, false); - - for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { - size_t newIndex = batch_idx * size; - IVectorPtr tmpId = 
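The top-k `rowMax` above relies on `std::partial_sort`, which orders only the first `beam` elements, costing roughly O(dim·log beam) comparisons instead of a full sort. A standalone sketch with the same comparator; for the row {0.1, 0.9, 0.5} with beam = 2 it yields (0.9, 1) then (0.5, 2):

```cpp
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Top-`beam` (value, index) pairs of one row, largest value first.
// Assumes beam <= dim, as the deleted callers guarantee.
std::vector<std::pair<float, int>> rowTopK(const float* row, size_t dim,
                                           size_t beam) {
  std::vector<std::pair<float, int>> v(dim);
  for (size_t j = 0; j < dim; ++j) v[j] = {row[j], static_cast<int>(j)};
  std::partial_sort(v.begin(), v.begin() + beam, v.end(),
                    [](const std::pair<float, int>& l,
                       const std::pair<float, int>& r) {
                      return l.first > r.first;
                    });
  v.resize(beam);
  return v;
}
```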
IVector::create(idForCpu + newIndex, size, false); - - for (size_t i = 0; i < channels; ++i) { - size_t newFeatLen = i * featLen; - for (size_t j = 0; j < groups; ++j) { - maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen) - ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen, - featLen); - } - } - maxInMat->colMax(*tmpId, *maxOutMat); - this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat); - } -} - -void CpuMatrix::maxoutBackward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - CHECK(dynamic_cast(&a)); - CHECK(dynamic_cast(&id)); - CHECK_EQ(a.getHeight(), getHeight()); - - size_t size = a.getWidth(); - size_t batchSize = getHeight(); - size_t featLen = size / channels; - size_t newFeatLen = groups * featLen; - real* inputG = getData(); - const real* outG = a.getData(); - int* idForCpu = id.getData(); - - for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { - size_t newIndex = batch_idx * size; - int* idData = idForCpu + newIndex; - - for (size_t i = 0; i < size; ++i) { - int gradIdx = - idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen; - (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i]; - } - } -} - -void CpuMatrix::rowNormalizeL1(Matrix& out) { - CHECK(!out.useGpu()); - - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(out.getHeight(), numSamples); - CHECK_EQ(out.getWidth(), dim); - real* a = getData(); - real* b = out.getData(); - for (size_t i = 0; i < numSamples; ++i) { - real s = 0; - for (size_t j = 0; j < dim; ++j) { - s += a[i * dim + j]; - } - // Right now, we just bet that sum won't be zero. If this really happens, - // we will figure out what should be done then. - CHECK_GT(s, 0); - s = 1 / s; - for (size_t j = 0; j < dim; ++j) { - b[i * dim + j] = s * a[i * dim + j]; - } - } -} - -/* calulate classification error */ -void CpuMatrix::classificationError(Matrix& output, - IVector& label, - size_t topkSize) { - size_t numSamples = this->getHeight(); - auto cpuOutput = dynamic_cast(&output); - auto cpuLabel = dynamic_cast(&label); - IVectorPtr cpuTopIds = std::make_shared(numSamples * topkSize); - MatrixPtr cpuTopVal = std::make_shared(numSamples, topkSize); - - CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer"; - CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed"; - CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal"; - CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1) - << "Matrix dimensions are not equal"; - - // top k matrix classification - cpuOutput->rowMax(*cpuTopIds, *cpuTopVal); - - size_t dim = cpuOutput->getWidth(); - real* result = this->getData(); - int* ids = cpuTopIds->getData(); - int* lbl = cpuLabel->getData(); - for (size_t i = 0; i < numSamples; ++i) { - CHECK_GE(lbl[i], 0); - CHECK_LT((size_t)lbl[i], dim); - - for (size_t j = 0; j < topkSize; ++j) { - if (ids[j + i * topkSize] == lbl[i]) { - result[i] = 0; - break; - } - result[i] = 1.0f; - } - } -} - -/* copy -log(output[label]) to this->data[i] */ -void CpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) { - CHECK(dynamic_cast(&output)); - CHECK(dynamic_cast(&label)); - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(label.getSize(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(getWidth(), (size_t)1); - - real* out = output.getData(); - real* cost = getData(); - int* lbl = label.getData(); - for (size_t i = 0; i < numSamples; ++i, out += dim) { - CHECK_GE(lbl[i], 0); - 
CHECK_LT((size_t)lbl[i], dim); - cost[i] = -std::log(out[lbl[i]]); - } -} - -/* calculate the error of outputV according to label */ -void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) { - CHECK(dynamic_cast(&output)); - CHECK(dynamic_cast(&label)); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(output.getWidth(), dim); - real* out = output.getData(); - real* grad = getData(); - int* lbl = label.getData(); - for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { - grad[lbl[i]] -= 1 / out[lbl[i]]; - } -} - -/* - We implement the matrix functionality in CostLayer.cpp, - but we define the scalar function here for sanity check - deletion of the function does not affect anything neverthelss -*/ -void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha) { - CHECK(dynamic_cast(&output)); - CHECK(dynamic_cast(&label)); - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(label.getSize(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(getWidth(), (size_t)1); - - real* out = output.getData(); - real* cost = getData(); - int* lbl = label.getData(); - for (size_t i = 0; i < numSamples; ++i, out += dim) { - CHECK_GE(lbl[i], 0); - CHECK_LT((size_t)lbl[i], dim); - real sum = 0; - for (size_t j = 0; j < dim; ++j) { - sum += out[j]; - } - sum = _safelog(sum); - cost[i] = -_safelog(out[lbl[i]]) + sum + alpha * _square(sum); - } -} - -/* - We implement the matrix functionality in CostLayer.cpp, - but we define the scalar function here for sanity check - deletion of the function does not affect anything neverthelss -*/ -void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output, - IVector& label, - real alpha) { - CHECK(dynamic_cast(&output)); - CHECK(dynamic_cast(&label)); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(output.getWidth(), dim); - real* out = output.getData(); - real* grad = getData(); - int* lbl = label.getData(); - - for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { - grad[lbl[i]] -= 1 / out[lbl[i]]; - real sum = 0; - for (size_t j = 0; j < dim; ++j) { - sum += out[j]; - } - for (size_t j = 0; j < dim; ++j) { - if (j == (size_t)lbl[i]) { - grad[j] += -1 / out[j]; - } - grad[j] += 1 / sum + 2 * alpha * _safelog(sum) / sum; - } - } -} - -#define FORWARD_LOOP() \ - size_t numSamples = getHeight(); \ - size_t dim = getWidth(); \ - CHECK_EQ(output.getHeight(), numSamples); \ - CHECK_EQ(output.getWidth(), dim); \ - const real* in = getData(); \ - real* out = output.getData(); \ - for (size_t i = 0; i < numSamples; ++i, in += dim, out += dim) - -#define BACKWARD_LOOP() \ - size_t numSamples = getHeight(); \ - size_t dim = getWidth(); \ - CHECK_EQ(output.getHeight(), numSamples); \ - CHECK_EQ(output.getWidth(), dim); \ - real* grad = getData(); \ - real* out = output.getData(); \ - for (size_t i = 0; i < numSamples; ++i, grad += dim, out += dim) - -void CpuMatrix::softmax(Matrix& output) { - CHECK(!output.useGpu()); - - const float THRESHOLD = -64.0; - - FORWARD_LOOP() { - real max = -1.0e20; - for (size_t j = 0; j < dim; ++j) { - if (in[j] > max) { - max = in[j]; - } - } - for (size_t j = 0; j < dim; ++j) { - real a = in[j] - max; - if (a < THRESHOLD) { - a = THRESHOLD; - } - out[j] = a; - } - vExp(dim, out, out); - - real sum = 0; - for (size_t j = 0; j < dim; ++j) { - sum += out[j]; - } - sum = 1 / sum; - for (size_t j = 0; j < dim; ++j) { - out[j] *= sum; - } - } -} - -void CpuMatrix::sequenceSoftmax(Matrix& 
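The deleted `softmax` subtracts the per-row maximum before exponentiating; the shift is exact rather than an approximation, because the common factor cancels:

$$\mathrm{softmax}(x)_j=\frac{e^{x_j}}{\sum_k e^{x_k}}=\frac{e^{x_j-m}}{\sum_k e^{x_k-m}},\qquad m=\max_k x_k.$$

Every exponent is then at most 0, so `vExp` cannot overflow, and the additional clamp at −64 floors each term at e^{−64} ≈ 1.6·10^{−28}, comfortably inside single-precision range, keeping the later `1 / sum` finite.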
output, const IVector& index) { - CHECK_EQ(getWidth(), 1UL); - CHECK_EQ(output.getWidth(), 1UL); - CHECK(isContiguous()); - - MatrixPtr inTmp = Matrix::create(nullptr, - /* height= */ 1, - 1, - /* trans= */ false, - false); - MatrixPtr outTmp = Matrix::create(nullptr, - /* height= */ 1, - 1, - /* trans= */ false, - false); - size_t numSequences = index.getSize() - 1; - auto starts = index.getData(); - for (size_t i = 0; i < numSequences; ++i) { - size_t offset = starts[i]; - size_t size = starts[i + 1] - starts[i]; - inTmp->setData(getData() + offset, 1UL, size); - outTmp->setData(output.getData() + offset, 1UL, size); - inTmp->softmax(*outTmp); - } -} - -void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { - CHECK(output.useGpu_ == false) << "Matrix type are not equal"; - CHECK_EQ(getHeight(), sftmaxSum.getHeight()); - - real* sums = sftmaxSum.getData(); - - BACKWARD_LOOP() { - real sum = sums[i]; - for (size_t j = 0; j < dim; ++j) { - grad[j] = out[j] * (grad[j] - sum); - } - } -} - -void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { - CHECK(output.useGpu_ == false && label.useGpu_ == false) - << "Matrix type are not equal"; - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(label.getHeight(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(label.getWidth(), dim); - CHECK_EQ(getWidth(), (size_t)1); - real* out = output.getData(); - real* cost = getData(); - - auto labelptr = dynamic_cast(&label); - if (labelptr) { - // it is a CpuSparseMatrix - if (labelptr->getFormat() == SPARSE_CSR) { - // treat label as a SparseMatrix - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = 0; j < dim; ++j) { - cost[i] += _square(out[i * dim + j]); - } - } - if (labelptr->getValueType() == NO_VALUE) { - int* cols = labelptr->getCols(); - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); - ++j) { - cost[i] += 1.0 - 2.0 * out[i * dim + cols[j]]; - /* - * explanation of above line: original codes are follows: - * cost[i] -= _square(out[i * dim + feature.col]); - * cost[i] += _square(1.0 - out[i * dim + feature.col]); - */ - } - } - } else if (labelptr->getValueType() == FLOAT_VALUE) { - int* cols = labelptr->getCols(); - real* values = labelptr->getValue(); - for (size_t i = 0; i < numSamples; ++i) { - real sum1 = 0; - real sum2 = 0; - for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); - ++j) { - sum1 += values[j] * values[j]; - sum2 += values[j] * out[i * dim + cols[j]]; - /* - * explanation of above line: original codes are follows: - * cost[i] -= _square(out[i * dim + feature.col]); - * cost[i] += _square(value.col - out[i * dim + feature.col]); - */ - } - cost[i] += sum1 - 2.0 * sum2; - } - } else { - LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares"; - return; - } - return; - } else { - LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares"; - return; - } - } - - BaseMatrix::sumOfSquaredDiffs(output, - label, - /* scaleSum= */ 1, - /* scaleDest= */ 1); -} - -/* calculate the error of outputV according to label */ -void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) { - CHECK(output.useGpu_ == false && label.useGpu_ == false) - << "Matrix type are not equal"; - - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(output.getWidth(), dim); - CHECK_EQ(label.getWidth(), dim); - - real* out = output.getData(); - real* grad = getData(); - - auto labelptr = 
dynamic_cast(&label); - if (labelptr) { - // it is a CpuSparseMatrix - if (labelptr->getFormat() == SPARSE_CSR) { - // treat label as a SparseMatrix - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = 0; j < dim; ++j) { - grad[i * dim + j] += 2.0 * out[i * dim + j]; - } - } - if (labelptr->getValueType() == NO_VALUE) { - int* cols = labelptr->getCols(); - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); - ++j) { - grad[i * dim + cols[j]] -= 2.0; - /* - * explanation of above line: original codes are follows: - * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col]; - * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col] - * - 1); - */ - } - } - } else if (labelptr->getValueType() == FLOAT_VALUE) { - int* cols = labelptr->getCols(); - real* values = labelptr->getValue(); - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); - ++j) { - grad[i * dim + cols[j]] -= 2.0 * values[j]; - /* - * explanation of above line: original codes are follows: - * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col]; - * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col] - * - value.col); - */ - } - } - } else { - LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares"; - return; - } - return; - } else { - LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares"; - return; - } - } - - real* lbl = label.getData(); - size_t ld = getStride(); - size_t outLd = output.getStride(); - size_t lblLd = label.getStride(); - CHECK(lbl); - for (size_t i = 0; i < numSamples; - ++i, out += outLd, lbl += lblLd, grad += ld) { - for (size_t j = 0; j < dim; ++j) { - grad[j] += 2.0 * (out[j] - lbl[j]); // positive gradient; - } - } -} - -void CpuMatrix::smoothL1(Matrix& output, Matrix& label, real destScale) { - CHECK(output.useGpu_ == false && label.useGpu_ == false) - << "Matrix type are not equal"; - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(label.getHeight(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(label.getWidth(), dim); - CHECK_EQ(getWidth(), (size_t)1); - - real* cost = getData(); - real* out = output.getData(); - real* lbl = label.getData(); - - for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) { - for (size_t j = 0; j < dim; ++j) { - real absVal = std::fabs(out[j] - lbl[j]); - cost[i] *= destScale; - if (absVal < 1.0) - cost[i] += 0.5 * absVal * absVal; - else - cost[i] += absVal - 0.5; - } - } -} - -void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label, real destScale) { - CHECK(output.useGpu_ == false && label.useGpu_ == false) - << "Matrix type are not equal"; - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(label.getHeight(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(label.getWidth(), dim); - CHECK_EQ(getWidth(), dim); - - real* out = output.getData(); - real* lbl = label.getData(); - real* grad = getData(); - - for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) { - for (size_t j = 0; j < dim; ++j) { - real val = out[j] - lbl[j]; - grad[j] *= destScale; - if (std::fabs(val) < 1) { - grad[j] += val; - } else { - grad[j] += (real(0) < val) - (val < real(0)); - } - } - } -} - -void CpuMatrix::tanh(Matrix& output) { - CHECK(isContiguous()); - CHECK(output.isContiguous()); - size_t numSamples = getHeight(); - size_t dim = 
getWidth(); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(output.getWidth(), dim); - vTanh(numSamples * dim, getData(), output.getData()); -} - -void CpuMatrix::tanhDerivative(Matrix& output) { - BaseMatrix::tanhDerivative(output); -} - -void CpuMatrix::softrelu(Matrix& output) { - CHECK(isContiguous()); - CHECK(output.isContiguous()); - const real THRESHOLD = 40.0; - FORWARD_LOOP() { // TODO(yuyang18): SIMD it? - for (size_t j = 0; j < dim; ++j) { - real x = in[j]; - if (x > THRESHOLD) { - x = THRESHOLD; - } else if (x < -THRESHOLD) { - x = -THRESHOLD; - } - out[j] = x; - } - } - vExp(numSamples * dim, output.getData(), output.getData()); - vLog1p(numSamples * dim, output.getData(), output.getData()); -} - -void CpuMatrix::softreluDerivative(Matrix& output) { - CHECK(isContiguous()); - CHECK(output.isContiguous()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - size_t size = numSamples * dim; - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(output.getWidth(), dim); - real* grad = getData(); - MatrixPtr tmpMat = Matrix::create(numSamples, dim); - real* tmp = tmpMat->getData(); - - vExp(size, output.getData(), tmpMat->getData()); - - for (size_t i = 0; i < size; ++i) { - grad[i] *= (1.0 - 1.0 / tmp[i]); - } -} - -void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { - CHECK(isContiguous()); - CHECK(output.isContiguous()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(output.getWidth(), dim); - - const real* in = getData(); - real* out = output.getData(); - - // out = p2*in - for (size_t i = 0; i < numSamples * dim; ++i) { - out[i] = p2 * in[i]; - } - - vTanh(numSamples * dim, out, out); - - // out = p1 * out - for (size_t i = 0; i < numSamples * dim; ++i) { - out[i] = p1 * out[i]; - } -} - -/* uniform randomization, minimize precision = 1e-5 */ -void CpuMatrix::randomizeUniform() { - CHECK(isContiguous()); - real* data = getData(); - unsigned int* randSeed = ThreadLocalRand::getSeed(); - real recipRandMax = 1.0f / (real)RAND_MAX; - for (size_t i = 0; i < elementCnt_; ++i) { - *data++ = rand_r(randSeed) * recipRandMax; - } -} - -void CpuMatrix::print(std::ostream& os) const { - CHECK(isContiguous()); - for (size_t i = 0; i < height_; ++i) { - for (size_t j = 0; j < width_; ++j) { - os << data_[i * width_ + j] << " "; - } - os << std::endl; - } -} - -void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) { - real* input = data.getData(); - real* w = W.getData(); - real* output = data_; - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = W.getHeight() * W.getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - - size_t partial_sum = numElements / paraSize; - if (paraSize == numElements) { - for (size_t n = 0; n < numSamples * numElements; ++n) { - output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements]; - } - return; - } - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - for (size_t n = 0; n < numSamples; ++n) { - for (size_t i = 0; i < paraSize; i++) { - neon::prelu( - input + i * partial_sum, w[i], output + i * partial_sum, partial_sum); - } - input = input + numElements; - output = output + numElements; - } -#else - for (size_t n = 0, k = 0; n < numSamples; ++n) { - for (size_t i = 0; i < numElements; ++i, ++k) { - output[k] = input[k] > 0 ? 
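`softreluDerivative` above needs only the forward output, thanks to the softplus identity

$$y=\log(1+e^{x})\;\Longrightarrow\;\frac{dy}{dx}=\frac{e^{x}}{1+e^{x}}=1-e^{-y},$$

which is what `grad[i] *= (1.0 - 1.0 / tmp[i])` computes with `tmp = exp(output)`; the ±40 clamp in the forward pass keeps both `exp` calls well inside floating-point range.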
input[k] : input[k] * w[i / partial_sum]; - } - } -#endif -} - -void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { - real* ograd = oGrad.getData(); - real* input = data.getData(); - real* wgrad = data_; - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = this->getHeight() * this->getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - size_t partial_sum = numElements / paraSize; - for (size_t n = 0, k = 0; n < numSamples; ++n) { - for (size_t i = 0; i < numElements; ++i, ++k) { - wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]); - } - } -} - -void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { - real* diff = data_; - real* input = data.getData(); - real* ograd = oGrad.getData(); - real* w = W.getData(); - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = W.getHeight() * W.getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - size_t partial_sum = numElements / paraSize; - for (size_t n = 0, k = 0; n < numSamples; ++n) { - for (size_t i = 0; i < numElements; ++i, ++k) { - diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]); - } - } -} - -void CpuMatrix::print(std::ostream& os, size_t height, size_t width) const { - CHECK(isContiguous()); - size_t h = height_ < height ? height_ : height; - size_t w = width_ < width ? width_ : width; - os.setf(std::ostream::scientific); - os << "["; - for (size_t i = 0; i < h; ++i) { - for (size_t j = 0; j < w; ++j) { - os << data_[i * width_ + j] << " "; - } - if (i == h - 1) { - os << "]"; - } - os << std::endl; - } -} - -void CpuMatrix::printOneRow(std::ostream& os, size_t idx) const { - CHECK_LT(idx, height_); - size_t offset = idx * stride_; - os << data_[offset]; - for (size_t i = 1; i < width_; ++i) { - os << " " << data_[offset + i]; - } - os << ";"; -} - -void CpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { - CHECK(isContiguous()); - CHECK(height_ == refMat.getHeight()); - CHECK(width_ == refMat.getWidth()); - CpuMatrix cpuRef(height_, width_); - cpuRef.copyFrom(refMat); - size_t diffCnt = 0; - for (size_t i = 0; i < height_; ++i) { - for (size_t j = 0; j < width_; ++j) { - real a = getElement(i, j); - real b = cpuRef.getElement(i, j); - if (fabs(a - b) > 0.00001) { - ++diffCnt; - if (printDiff) { - os << "ref= " << a << " check= " << b << std::endl; - } - } - } - } - LOG(INFO) << "the diffCnt is " << diffCnt; -} - -real CpuMatrix::getMin() { - size_t size = getHeight() * getWidth(); - real* data = getData(); - real res = data[0]; - for (size_t i = 1; i < size; ++i) { - if (res > data[i]) { - res = data[i]; - } - } - return res; -} - -real CpuMatrix::getMax() { - size_t size = getHeight() * getWidth(); - real* data = getData(); - real res = data[0]; - for (size_t i = 1; i < size; ++i) { - if (res < data[i]) { - res = data[i]; - } - } - return res; -} - -void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) { - size_t height = this->getHeight(); - size_t width0 = this->getWidth(); - size_t width1 = in1.getWidth(); - - CHECK_EQ(height, in0.getHeight()); - CHECK_EQ(width0, in0.getWidth()); - CHECK_EQ(height, in1.getHeight()); - - CHECK_EQ(width1 % 2, 1U); - - real* outV = this->getData(); - real* inV0 = in0.getData(); - real* inV1 = in1.getData(); - - int leftCtxLen = (width1 - 1) / 2; - for (size_t x = 0; x < height; - ++x, outV += width0, inV0 += width0, inV1 += width1) { - for (size_t i = 
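The three `paramRelu*` kernels above are the PReLU forward pass and its two gradients, with one learned slope shared across each run of `partial_sum` consecutive activations (`w[i / partial_sum]`):

$$o=\begin{cases}x,&x>0\\ w\,x,&x\le 0,\end{cases}\qquad \frac{\partial o}{\partial w}=\begin{cases}0,&x>0\\ x,&x\le 0,\end{cases}\qquad \frac{\partial o}{\partial x}=\begin{cases}1,&x>0\\ w,&x\le 0,\end{cases}$$

so `paramReluBackwardW` accumulates grad·x only where x ≤ 0, and `paramReluBackwardDiff` multiplies the output gradient by 1 or w per element.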
0; i < width0; ++i) { // each dimension of output - for (size_t j = 0; j < width1; ++j) { - // iterate over all dimentions of inV1 - int index = i + j - leftCtxLen; - index = (index + width0) % width0; - outV[i] += inV0[index] * inV1[j]; - } - } - } -} - -void CpuMatrix::circularConvDerivative( - Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) { - size_t height = in0.getHeight(); - size_t width0 = in0.getWidth(); - size_t width1 = in1.getWidth(); - - CHECK_EQ(height, in1.getHeight()); - CHECK_EQ(height, inG0.getHeight()); - CHECK_EQ(width0, inG0.getWidth()); - CHECK_EQ(height, inG1.getHeight()); - CHECK_EQ(width1, inG1.getWidth()); - CHECK_EQ(height, outG.getHeight()); - CHECK_EQ(width0, outG.getWidth()); - - real* outGV = outG.getData(); - real* inV0 = in0.getData(); - real* inV1 = in1.getData(); - real* inGV0 = inG0.getData(); - real* inGV1 = inG1.getData(); - - int leftCtxLen = (width1 - 1) / 2; - for (size_t x = 0; x < height; ++x, - outGV += width0, - inV0 += width0, - inV1 += width1, - inGV0 += width0, - inGV1 += width1) { - for (size_t j = 0; j < width1; ++j) { // iterate over width1 - for (size_t i = 0; i < width0; ++i) { - // such over all dimensions of outG - int index = i + j - leftCtxLen; - index = (index + width0) % width0; - inGV0[index] += outGV[i] * inV1[j]; - inGV1[j] += outGV[i] * inV0[index]; - } - } - } -} - -void CpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { - CHECK(dynamic_cast(&output)); - auto labelPtr = dynamic_cast(&label); - CHECK(labelPtr); - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(numSamples, output.getHeight()); - CHECK_EQ(numSamples, labelPtr->getHeight()); - CHECK_EQ(dim, labelPtr->getWidth()); - - real* out = output.getData(); - real* cost = getData(); - for (size_t i = 0; i < numSamples; ++i, out += dim) { - for (size_t j = 0; j < dim; ++j) { - CHECK(out[j] > 0 && out[j] < 1.0); - cost[i] -= std::log(1 - out[j]); - } - - const int* cols = labelPtr->getRowCols(i); - for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { - CHECK_LT(size_t(cols[j]), dim); - cost[i] -= std::log(out[cols[j]] / (1 - out[cols[j]])); - } - } -} - -void CpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { - CHECK(dynamic_cast(&output)); - auto labelPtr = dynamic_cast(&label); - CHECK(labelPtr); - - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(numSamples, output.getHeight()); - CHECK_EQ(numSamples, labelPtr->getHeight()); - CHECK_EQ(dim, output.getWidth()); - CHECK_EQ(dim, labelPtr->getWidth()); - - real* out = output.getData(); - real* grad = getData(); - for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { - for (size_t j = 0; j < dim; ++j) { - CHECK(out[j] > 0 && out[j] < 1.0); - grad[j] += 1.0 / (1 - out[j]); - } - - const int* cols = labelPtr->getRowCols(i); - for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { - CHECK_LT(size_t(cols[j]), dim); - grad[cols[j]] -= 1.0 / (out[cols[j]] * (1 - out[cols[j]])); - } - } -} - -/* calculate the classification error for multi binary label */ -void CpuMatrix::classificationErrorMulti(Matrix& output, - Matrix& label, - real threshold) { - CHECK(dynamic_cast(&output)); - auto labelPtr = dynamic_cast(&label); - CHECK(labelPtr); - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(numSamples, output.getHeight()); - CHECK_EQ(numSamples, labelPtr->getHeight()); - CHECK_EQ(dim, labelPtr->getWidth()); - - real* out = output.getData(); - real* result = getData(); - for 
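`circularConv` above centers an odd-width kernel on each output position and wraps out-of-range reads around the row. A compact standalone version; for n = 5, k = 3 the first output reads in0[4], in0[0], in0[1]. Like the deleted code, it assumes an odd kernel no wider than the row:

```cpp
// out[i] += sum_j in0[(i + j - L) mod n] * in1[j], with L = (k - 1) / 2:
// a 1-D convolution whose out-of-range reads wrap around the row.
void circularConvRow(float* out, const float* in0, int n,
                     const float* in1, int k) {
  int L = (k - 1) / 2;  // half-width of the (odd) kernel
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < k; ++j) {
      int idx = (i + j - L + n) % n;  // fold negative offsets back into [0, n)
      out[i] += in0[idx] * in1[j];
    }
  }
}
```

`circularConvDerivative` then scatters the same wrapped index pattern into both input gradients, mirroring the forward reads.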
(size_t i = 0; i < numSamples; ++i, out += dim) { - real sum = 0.0; - for (size_t j = 0; j < dim; ++j) { - if (out[j] >= threshold) { - sum += 1.0; - } - } - - const int* cols = labelPtr->getRowCols(i); - for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { - CHECK_LT(size_t(cols[j]), dim); - if (out[cols[j]] < threshold) { - sum += 1.0; - } else { - sum -= 1.0; - } - } - result[i] = sum / dim; - } -} - -void CpuMatrix::bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - CHECK(dynamic_cast(&in)); - - size_t outputW = getWidth(); - size_t batchSize = getHeight(); - size_t inputW = in.getWidth(); - size_t inputH = in.getHeight(); - size_t inPosOffset = inImgH * inImgW; - size_t outPosOffset = outImgH * outImgW; - (void)(inputH); - - real* outData = getData(); - const real* inData = in.getData(); - - if (inImgH == outImgH && inImgW == outImgW) { - this->copyFrom(in); - } else { - for (size_t k = 0; k < batchSize; ++k) { // loop for batches - for (size_t i = 0; i < outImgH; ++i) { // loop for images - size_t h = ratioH * i; - size_t hid = (h < inImgH - 1) ? 1 : 0; - real h1lambda = ratioH * i - h; - real h2lambda = 1 - h1lambda; - - for (size_t j = 0; j < outImgW; ++j) { - size_t w = ratioW * j; - size_t wid = (w < inImgW - 1) ? 1 : 0; - real w1lambda = ratioW * j - w; - real w2lambda = 1 - w1lambda; - // calculate four position for bilinear interpolation - const real* inPos = &inData[k * inputW + h * inImgW + w]; - real* outPos = &outData[k * outputW + i * outImgW + j]; - for (size_t c = 0; c < numChannels; ++c) { // loop for channels - // bilinear interpolation - outPos[0] = - h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) + - h1lambda * (w2lambda * inPos[hid * inImgW] + - w1lambda * inPos[hid * inImgW + wid]); - inPos += inPosOffset; - outPos += outPosOffset; - } - } - } - } - } -} - -void CpuMatrix::bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - CHECK(dynamic_cast(&out)); - - size_t inputW = getWidth(); - size_t inputH = getHeight(); - size_t outputW = out.getWidth(); - size_t batchSize = out.getHeight(); - size_t inPosOffset = inImgH * inImgW; - size_t outPosOffset = outImgH * outImgW; - (void)(inputH); - - real* inGrad = getData(); - const real* outGrad = out.getData(); - - if (inImgH == outImgH && inImgW == outImgW) { - this->add(const_cast(out)); - } else { - for (size_t k = 0; k < batchSize; ++k) { // loop for batches - for (size_t i = 0; i < outImgH; ++i) { // loop for images - size_t h = ratioH * i; - size_t hid = (h < inImgH - 1) ? 1 : 0; - real h1lambda = ratioH * i - h; - real h2lambda = 1 - h1lambda; - for (size_t j = 0; j < outImgW; ++j) { - size_t w = ratioW * j; - size_t wid = (w < inImgW - 1) ? 
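`bilinearForward` above is the standard four-point interpolation. With h = ⌊ratioH·i⌋, w = ⌊ratioW·j⌋ and fractional parts λ_h = ratioH·i − h, λ_w = ratioW·j − w (named `h1lambda`/`w1lambda` in the code):

$$\text{out}(i,j)=(1-\lambda_h)\big[(1-\lambda_w)\,x(h,w)+\lambda_w\,x(h,w{+}1)\big]+\lambda_h\big[(1-\lambda_w)\,x(h{+}1,w)+\lambda_w\,x(h{+}1,w{+}1)\big].$$

The `hid`/`wid` flags zero the +1 offsets on the last input row and column so the four reads stay in bounds, and `bilinearBackward` scatters the same four weights onto the input gradient.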
1 : 0; - real w1lambda = ratioW * j - w; - real w2lambda = 1 - w1lambda; - - real* inPos = &inGrad[k * inputW + h * inImgW + w]; - const real* outPos = &outGrad[k * outputW + i * outImgW + j]; - for (size_t c = 0; c < numChannels; ++c) { // loop for channels - inPos[0] += h2lambda * w2lambda * outPos[0]; - inPos[wid] += h2lambda * w1lambda * outPos[0]; - inPos[hid * inImgW] += h1lambda * w2lambda * outPos[0]; - inPos[hid * inImgW + wid] += h1lambda * w1lambda * outPos[0]; - inPos += inPosOffset; - outPos += outPosOffset; - } - } - } - } - } -} - -void CpuMatrix::vol2Col(real* data, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW) { - real* outData = getData(); - int outHeight = (height + 2 * paddingH - filterH) / strideH + 1; - int outWidth = (width + 2 * paddingW - filterW) / strideW + 1; - int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1; - - int channelsCol = channels * filterD * filterH * filterW; - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterW; - int hOffset = (c / filterW) % filterH; - int dOffset = (c / filterW / filterH) % filterD; - int cIn = c / filterW / filterH / filterD; - for (int d = 0; d < outDepth; ++d) { - for (int h = 0; h < outHeight; ++h) { - for (int w = 0; w < outWidth; ++w) { - int dPad = d * strideD - paddingD + dOffset; - int hPad = h * strideH - paddingH + hOffset; - int wPad = w * strideW - paddingW + wOffset; - - if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width && - dPad >= 0 && dPad < depth) - outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = - data[((cIn * depth + dPad) * height + hPad) * width + wPad]; - else - outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = 0; - } - } - } - } -} - -void CpuMatrix::col2Vol(real* trg, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real alpha, - real beta) { - real* src = getData(); - int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1; - int outHeight = (height + 2 * paddingH - filterH) / strideH + 1; - int outWidth = (width + 2 * paddingW - filterW) / strideW + 1; - int channelsCol = channels * filterD * filterH * filterW; - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterW; - int hOffset = (c / filterW) % filterH; - int dOffset = (c / filterW / filterH) % filterD; - int cIm = c / filterW / filterH / filterD; - for (int d = 0; d < outDepth; ++d) { - for (int h = 0; h < outHeight; ++h) { - for (int w = 0; w < outWidth; ++w) { - int dPad = d * strideD - paddingD + dOffset; - int hPad = h * strideH - paddingH + hOffset; - int wPad = w * strideW - paddingW + wOffset; - if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width && - dPad >= 0 && dPad < depth) - trg[((cIm * depth + dPad) * height + hPad) * width + wPad] = - alpha * - src[((c * outDepth + d) * outHeight + h) * outWidth + w] + - beta * - trg[((cIm * depth + dPad) * height + hPad) * width + wPad]; - } - } - } - } -} - -//////////////////////////////////////////////////////////////// -// functions executed via cpu // -//////////////////////////////////////////////////////////////// - -void GpuMatrix::selectElements(Matrix& table, IVector& ids) { - execViaCpu2(&CpuMatrix::selectElements, *this, table, ids); -} -} // namespace paddle diff --git a/paddle/math/Matrix.h 
b/paddle/math/Matrix.h deleted file mode 100644 index 4c3b2c95361065372f5969a2da73bce0eb9d123f..0000000000000000000000000000000000000000 --- a/paddle/math/Matrix.h +++ /dev/null @@ -1,2189 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/utils/Logging.h" -#include "paddle/utils/ThreadLocal.h" - -#include - -#include "BaseMatrix.h" -#include "MemoryHandle.h" -#include "Vector.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/// TODO(tianbing), move to paddle/function/TensorType.h -enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 }; - -/** - * @brief matrix sparse_format . - * - * nnz represents nonzero number in sparse matrix. - * - * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element - * represents row start index in Matrix. length of col and value are nnz. - * - * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element - * represents col start index in Matrix. length of col and value are nnz. - * - * @code - * for example: [0, 1, 0, 2, 0; - * 1, 0, 0, 0, 0; - * 0, 0, 0, 2, 5]; - * SPARSE_CSR row [0, 2, 3, 5]; - * col [1, 3, 0, 3, 4]; - * value [1, 2, 1, 2, 5] - * SPARSE_CSC col [0, 1, 2, 2, 4, 5]; - * row [1, 0, 0, 2, 2]; - * value [1, 1, 2, 2, 5] - * @endcode - */ -/// TODO(tianbing), move to paddle/function/TensorType.h -enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 }; - -class Matrix; -class GpuMatrix; -class CpuMatrix; -class CpuSparseMatrix; -class GpuSparseMatrix; -typedef std::shared_ptr MatrixPtr; -typedef std::shared_ptr GpuMatrixPtr; -typedef std::shared_ptr CpuMatrixPtr; -typedef std::shared_ptr GpuSparseMatrixPtr; -typedef std::shared_ptr CpuSparseMatrixPtr; - -/** - * Copy or assignemnt constructor will share the data as opposed to making a - * copy of the original data. To make a copy of the orinal data, use copyFrom() - * instead. 
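The sparse-format doc comment above is easiest to verify by expanding its CSR triple back to dense form; a small illustrative helper (not part of the deleted header):

```cpp
#include <vector>

// Expand a CSR triple to dense: row[i]..row[i+1] delimit row i's nonzeros,
// col[j] is the column of the j-th stored value.
std::vector<std::vector<float>> csrToDense(const std::vector<int>& row,
                                           const std::vector<int>& col,
                                           const std::vector<float>& val,
                                           int height, int width) {
  std::vector<std::vector<float>> m(height, std::vector<float>(width, 0));
  for (int i = 0; i < height; ++i)
    for (int j = row[i]; j < row[i + 1]; ++j)
      m[i][col[j]] = val[j];
  return m;
}

// csrToDense({0, 2, 3, 5}, {1, 3, 0, 3, 4}, {1, 2, 1, 2, 5}, 3, 5)
// reproduces the documented matrix [0 1 0 2 0; 1 0 0 0 0; 0 0 0 2 5].
```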
- */ -class Matrix : public BaseMatrix { - protected: - Matrix(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans, - bool use_gpu); - - Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu); - - Matrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans, - bool use_gpu); - - static ThreadLocal tmpMat_; - - public: - size_t elementCnt_; // maximal number of elements which can be held in data_ - MemoryHandlePtr memoryHandle_; - - public: - virtual ~Matrix() {} - - static MatrixPtr create(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans = false); - static MatrixPtr create(size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - static MatrixPtr create(real* data, - size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - static MatrixPtr create(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false, - bool useGpu = false); - - static MatrixPtr createSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - bool trans = false, - bool useGpu = false); - static MatrixPtr createSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - SparseFormat foramt = SPARSE_CSR, - bool trans = false, - bool useGpu = false); - - static MatrixPtr createSparseMatrix(real* data, - int* row, - int* col, - size_t height, - size_t width, - size_t nnz, /* used to allocate space */ - SparseValueType valueType, /*value type*/ - SparseFormat format, - bool trans, - bool useGpu); - - static void resizeOrCreateSparseMatrix( - MatrixPtr& matrix, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - SparseFormat foramt = SPARSE_CSR, - bool trans = false, - bool useGpu = false); - - static void resizeOrCreate(MatrixPtr& a, - size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - - /** - * @brief set the data buffer used to hold the matrix data. - * - * caller should make sure that the size of data is at least - * sizeof(real)*height*width. - */ - void setData(real* data) { - BaseMatrix::setData(data); - memoryHandle_.reset(); - } - - /// the data should be contiguous - void setData(real* data, size_t newHeight, size_t newWidth) { - setData(data); - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newHeight * newWidth; - stride_ = width_; - } - - size_t getWidth() const { return width_; } - size_t getHeight() const { return height_; } - size_t getStride() const { return stride_; } - size_t getElementCnt() const { return elementCnt_; } - virtual real* getData() { return data_; } - virtual const real* getData() const { return data_; } - bool isTransposed() const { return trans_; } - bool isContiguous() const { return stride_ == width_ || height_ == 1; } - - // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix - // befor call the following functions. - // Declare these functions in the base class just easy to call them. - // And these declarations should be moved to base class of sparse matrix - // if refactor sparse matrix - virtual int* getRows() const { - LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. - } - - virtual int* getCols() const { - LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. - } - - virtual SparseFormat getFormat() const { - LOG(FATAL) << "Not implemented"; - return SPARSE_CSR; //! 
suppress warning for no return value. - } - - virtual SparseValueType getValueType() const { - LOG(FATAL) << "Not implemented"; - return NO_VALUE; //! suppress warning for no return value. - } - - /** - * @brief matrix elment-wise add - * - * Named add3 just because add/add2 has been used in BaseMatrix.cu - * and they are not virtual function. - */ - virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; } - - MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; } - - virtual void zeroMem() { LOG(FATAL) << "Not implemented"; } - - virtual void resetOne() { LOG(FATAL) << "Not implemented"; } - - void setDiag(real value); - - virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; } - - virtual void trimFrom(const CpuSparseMatrix& src) { - LOG(FATAL) << "Not implemented"; - } - - // For GpuMatrix this is an asynchronous copy interface - // For CpuMatrix this is an synchronous copy interface - virtual void copyFrom(const Matrix& src, hl_stream_t stream) { - LOG(FATAL) << "Not implemented"; - } - - MatrixPtr subMatrix(size_t startRow, - size_t endRow, - size_t startCol, - size_t endCol); - - MatrixPtr subRowMatrix(size_t startRow, size_t endRow) { - return subMatrix(startRow, endRow, 0, getWidth()); - } - - MatrixPtr subColMatrix(size_t startCol, size_t endCol) { - return subMatrix(0, getHeight(), startCol, endCol); - } - - virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) { - CHECK_LE(startRow + numRows, getHeight()); - return Matrix::create(getData() + startRow * getWidth(), - numRows, - getWidth(), - trans_, - useGpu_); - } - virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) { - CHECK_LE(startRow + numRows, getHeight()); - CHECK_EQ(useGpu_, dest->useGpu_); - dest->setData(this->rowBuf(startRow), numRows, getWidth()); - return dest; - } - - /** - * If this is GpuMatrix, src is assumed to be CPU memory - * - * If this is CpuMatrix, src is assumed to be CPU memory - */ - virtual void copyFrom(const real* src, size_t size) { - LOG(FATAL) << "Not implemented"; - } - - virtual void copyFrom(const real* src, const int64_t* seq) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief convert a int vector to a real matrix. - * - * (1) source and dest are both in CPU. - * - * (2) sizes are exactly match. - */ - virtual void copyFrom(const IVector& src) { - LOG(FATAL) << "copy data from int vector only available on CpuMatrix."; - } - - virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix, - * NonValueSparseMatrix, etc.) as this. - * - * If height and width is zero, the new matrix will have the same size - * as this, otherwise the new matrix will have the specified size. - * - */ - virtual MatrixPtr clone(size_t height = 0, - size_t width = 0, - bool useGpu = false) { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - virtual real* getRowBuf(size_t row) { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - virtual real getElement(size_t x, size_t y) const { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual real getSum() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual void accumulateColSum(Matrix& src) { - LOG(FATAL) << "Not implemented"; - } - - virtual real getAbsSum() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - /** - * @note Original data may not be preserved after resize(). 
- */ - virtual void resize(size_t newHeight, size_t newWidth) = 0; - - /** - * @note This should only be used for sparse matrix. - */ - virtual void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* total item used to allocate space */ - SparseValueType valueType, - SparseFormat format) = 0; - - /** - * @brief This should only be used for sparse matrix. - * - * Currently must be called for each row in order. - * The matrix is not valid until setRow is called for the last row. - */ - virtual void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) = 0; - - virtual MatrixPtr getTranspose() = 0; - - /** - * @brief hard transpose. - * - * allocate matTrans' memory outside, then set memAlloc as false; - * else set as true. - */ - virtual void transpose(MatrixPtr& matTrans, bool memAlloc) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief rotate 90 degrees in clock-wise if clockWise=true; - * otherwise rotate in anti clock-wise - * clock-wise: - * \f[ - * y(j,i) = x(M-i-1,j) - * \f] - * anti clock-wise: - * \f[ - * y(j,i) = x(i, N-1-j) - * \f] - * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output. - * - * allocate matRot' memory outside, then set memAlloc as false; - * else set as true. - */ - virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { - LOG(FATAL) << "Not implemented"; - } - - virtual MatrixPtr getInverse() { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - /** - * @brief inverse. - * - * if allocate matInv's memory outside, then set memAlloc as false; - * else set as true. - */ - virtual void inverse(MatrixPtr& matInv, bool memAlloc) { - LOG(FATAL) << "Not implemented"; - } - - public: - /// Only set all variables to 0 or NULL but not free them. - virtual void clear() { - height_ = 0; - width_ = 0; - data_ = NULL; - } - - void reshape(size_t height, size_t width); - - /// add b to each sample of this. - virtual void addBias(Matrix& b, real scale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void addSharedBias(Matrix& b, real scale) { - LOG(FATAL) << "Not implemented"; - } - - void addBias(Matrix& b, real scale, bool sharedBias) { - if (!sharedBias) { - addBias(b, scale); - } else { - addSharedBias(b, scale); - } - } - - /// add each sample from a to this. - virtual void collectBias(Matrix& a, real scale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void collectSharedBias(Matrix& a, real scale) { - LOG(FATAL) << "Not implemented"; - } - - void collectBias(Matrix& a, real scale, bool sharedBias) { - if (!sharedBias) { - collectBias(a, scale); - } else { - collectSharedBias(a, scale); - } - } - - virtual void sequenceAvgForward(Matrix& a, - const IVector& startsPos, - int mode) { - LOG(FATAL) << "Not implemented"; - } - - virtual void sequenceAvgBackward(Matrix& a, - const IVector& startsPos, - int mode) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = scaleAB*(a*b) + scaleT*this - * @endcode - */ - virtual void mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /// Add a vector (column) b to matrix a, column by column. 
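// ---------------------------------------------------------------------------
// A standalone sketch of the rotate() index mapping documented above, for the
// clockwise case y(j, i) = x(M - i - 1, j) on row-major buffers. The function
// name and buffer layout are illustrative assumptions, not part of the
// Matrix API.
#include <cstdio>

void rotateClockwise(const float* x, float* y, int M, int N) {
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      y[j * M + i] = x[(M - i - 1) * N + j];  // (N x M) output from (M x N) input
}

int main() {
  const float x[6] = {1, 2, 3, 4, 5, 6};  // 2x3 input
  float y[6];
  rotateClockwise(x, y, 2, 3);
  for (float v : y) std::printf("%g ", v);  // prints: 4 1 5 2 6 3
}
// ---------------------------------------------------------------------------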
- virtual void addColumnVector(const Matrix& b) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * For j < codeLength: - * this(i, j) += vec(index(i, j), 0) - * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1 - * @endcode - */ - virtual void addByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& vec) { - (void)numClasses; - (void)codes; - (void)vec; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * For j < codeLength: - * vec(index(i, j), 0) += this(i, j) - * where index is the same as the index for addByBitCode - * @endcode - */ - virtual void addByBitCodeBackward(size_t numClasses, - const IVector& codes, - Matrix& vec) { - (void)numClasses; - (void)codes; - (void)vec; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * For j < codeLength: - * this(i, j) += <mat.row(index(i, j)), input.row(i)> - * where index is the same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& mat, - const Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * For j < codeLength: - * mat.row(index(i, j)) += this(i, j) * input.row(i) - * where index is the same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, - Matrix& mat, - const Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * For j < codeLength: - * input.row(i) += this(i, j) * mat.row(index(i, j)) - * where index is the same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCodeBackwardError(size_t numClasses, - const IVector& codes, - const Matrix& mat, - Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * For j < codeLength - * sum(i, 0) = scaleSum * \sum_j bit(i, j) * this(i, j) - * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0 - * @endcode - */ - virtual void sumByBitCode(size_t numClasses, - IVector& codes, - Matrix& sum, - real scaleSum) { - (void)numClasses; - (void)codes; - (void)sum; - (void)scaleSum; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * For j < codeLength - * this(i, j) -= bit(i, j) - * where bit(i, j) is the same as that for sumByBitCode - * @endcode - */ - virtual void subByBitCode(size_t numClasses_, IVector& codes) { - (void)numClasses_; - (void)codes; - LOG(FATAL) << "Not implemented"; - } - - /** - * add the sum of each row of this to sum - */ - virtual void rowSum(Matrix& sum) { - (void)sum; - LOG(FATAL) << "Not implemented"; - } - - /** - * set the max of each row of this to max - */ - virtual void rowMax(Matrix& max) { - (void)max; - LOG(FATAL) << "Not implemented"; - } - - /** - * set the max of each column of this to max - */ - virtual void colMax(Matrix& max) { LOG(FATAL) << "Not implemented"; } - - /** - * @brief Get the top k elements of each column of this matrix. - * - * The row ids and values of these elements are stored in - * maxIds and max respectively, where k is the size of maxIds. - * And note that the top k elements are not sorted.
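// ---------------------------------------------------------------------------
// Worked example of the tree index used by the addByBitCode/mulByBitCode
// family above: index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1 walks
// from a leaf code up the implicit complete binary tree. Standalone sketch
// with illustrative values.
#include <cstdio>

int main() {
  const int numClasses = 8;  // codeLength == 3 for 8 classes
  const int code = 3;        // codes(i) for some sample i
  for (int j = 0; j < 3; ++j) {
    int index = ((code + numClasses) >> (j + 1)) - 1;
    std::printf("j=%d -> index=%d\n", j, index);  // prints 4, 1, 0
  }
}
// ---------------------------------------------------------------------------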
- */ - virtual void colMax(IVector& maxIds, Matrix& maxVal) { - LOG(FATAL) << "not implemented"; - } - - virtual void maxoutForward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - LOG(FATAL) << "not implemented"; - } - - virtual void maxoutBackward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - LOG(FATAL) << "not implemented"; - } - - virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } - - /** - * @brief Get the top k elements of each row of this matrix. - * - * The column ids and values of these elements are stored in - * maxIds and max respectively. where k is the size of maxIds. - * And note that the top k elements are not sorted. - */ - virtual void rowMax(IVector& maxIds, Matrix& max) { - LOG(FATAL) << "Not implemented"; - } - - /// normalize each row so that the sum of each row is 1. - virtual void rowNormalizeL1(Matrix& out) { - (void)out; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * this = a*b - * @endcode - */ - virtual void mul(const Matrix& a, const Matrix& b) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = scaleAB*(this*b) + scaleT*this - * @endcode - */ - virtual void rightMul(Matrix& b, real scaleAB, real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = this* b - * @endcode - */ - virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; } - - /** - * @code - * this = scaleAB*(a*this) + scaleT*this - * @endcode - */ - virtual void leftMul(Matrix& a, real scaleAB, real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = a*this) - * @endcode - */ - virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; } - - /// merge the element for each col. - virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; } - - /// copy -log(output[label]) to this->data[i]. - virtual void oneHotCrossEntropy(Matrix& output, IVector& label) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the error of outputV according to label. - virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { - LOG(FATAL) << "Not implemented"; - } - - /// copy -log(output[label]) to this->data[i]. - virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the error of outputV according to label. - virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; - } - - /** - * \f[ - * a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j} - * \f] - * - * b contains M elements, - * c contains N elements (N is odd), - * b's index arithmetic is computed modulo M, - * c's index arithmetic is computed modulo N. 
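// ---------------------------------------------------------------------------
// Reference version of the circularConv() formula above, with both index
// computations wrapped modulo their lengths (N odd). Standalone sketch; this
// is not the paddle implementation.
#include <cstdio>

void circularConvRef(const float* b, int M, const float* c, int N, float* a) {
  int half = (N - 1) / 2;
  for (int i = 0; i < M; ++i) {
    float sum = 0;
    for (int j = -half; j <= half; ++j) {
      int bi = ((i + j) % M + M) % M;  // b index wrapped into [0, M)
      int cj = ((j % N) + N) % N;      // c index wrapped into [0, N)
      sum += b[bi] * c[cj];
    }
    a[i] = sum;
  }
}

int main() {
  const float b[4] = {1, 2, 3, 4};
  const float c[3] = {0.5f, 1.0f, 0.5f};  // c[-1] wraps to c[2]
  float a[4];
  circularConvRef(b, 4, c, 3, a);
  for (float v : a) std::printf("%g ", v);  // prints: 4.5 4.5 6.5 4.5
}
// ---------------------------------------------------------------------------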
- */ - virtual void circularConv(Matrix& b, Matrix& c) { - LOG(FATAL) << "Not implemented"; - } - - virtual void circularConvDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2) { - LOG(FATAL) << "Not implemented"; - } - - /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */ - virtual void softmax(Matrix& output) { - (void)output; - LOG(FATAL) << "Not implemented"; - } - virtual void sequenceSoftmax(Matrix& output, const IVector& index) { - (void)output; - LOG(FATAL) << "Not implemented"; - } - - virtual void softmaxBackward(Matrix& outputV) { - (void)outputV; - LOG(FATAL) << "Not implemented"; - } - - /* - sum_i = sum_j this_ij * output_ij - this_ij = output_ij * (this_ij - sum_i) - */ - virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the sum of squares diff cost. - virtual void sumOfSquares(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /// gradient of sumOfSquares. - virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - virtual void smoothL1(Matrix& output, Matrix& label, real destScale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void smoothL1Bp(Matrix& outputV, Matrix& label, real destScale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; } - - virtual void tanhDerivative(Matrix& output) { - LOG(FATAL) << "Not implemented"; - } - - virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; } - - virtual void softreluDerivative(Matrix& output) { - LOG(FATAL) << "Not implemented"; - } - - virtual void scaledTanh(Matrix& output, real p1, real p2) { - LOG(FATAL) << "Not implemented"; - } - - /// print out the values of elements to os - virtual void print(std::ostream& os) const { - LOG(FATAL) << "Not implemented"; - } - - /** - * print a part of the matrix - * from the (top,left) value to the (height, width) value (not included) - */ - virtual void print(std::ostream& os, size_t height, size_t width) const { - LOG(FATAL) << "Not implemented"; - } - - /// print one row to os - virtual void printOneRow(std::ostream& os, size_t idx) const { - LOG(FATAL) << "Not implemented"; - } - - virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {} - - virtual real getMin() { - LOG(FATAL) << "Not implemented"; - return 0; - } - virtual real getMax() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; } - - /** - * @brief calculate the error of classification - * - * output[i] = 1 if row i is an error. - * - * output[i] = 0 if row i is correct. - * - */ - virtual void classificationError(Matrix& output, - IVector& label, - size_t topkSize = 1) { - LOG(FATAL) << "Not implemented"; - } - - virtual void upsampleForward(Matrix& input, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - LOG(FATAL) << "Not implemented"; - } - - virtual void upsampleBackward(Matrix& outputGrad, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - LOG(FATAL) << "Not implemented"; - } - - /** - * Pooling forward operation: pick out the largest element - * in each pooling window; if maskMatP is not NULL, it will - * also calculate the location indices.
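// ---------------------------------------------------------------------------
// Reference loop for the softmaxDerivative() comment above: per row,
// sum_i = sum_j grad(i,j) * out(i,j), then
// grad(i,j) = out(i,j) * (grad(i,j) - sum_i).
// Standalone sketch on row-major buffers.
#include <cstddef>

void softmaxDerivativeRef(float* grad, const float* out,
                          std::size_t height, std::size_t width) {
  for (std::size_t i = 0; i < height; ++i) {
    float rowSum = 0;
    for (std::size_t j = 0; j < width; ++j)
      rowSum += grad[i * width + j] * out[i * width + j];
    for (std::size_t j = 0; j < width; ++j)
      grad[i * width + j] =
          out[i * width + j] * (grad[i * width + j] - rowSum);
  }
}
// ---------------------------------------------------------------------------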
- */ - virtual void maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - MatrixPtr maskMatP = NULL) { - LOG(FATAL) << "Not implemented"; - } - - /// Pooling backward operation. - virtual void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemented"; - } - - /// Pooling forward operation, calculate the average of sizeX elements. - virtual void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - bool excludeMode = true) { - LOG(FATAL) << "Not implemented"; - } - - virtual void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW, - bool excludeMode = true) { - LOG(FATAL) << "Not implemented"; - } - - /** - * Pooling 3D forward operation, pick out the largest element - * in each pooling window - */ - virtual void maxPool3DForward(Matrix& inputMat, - Matrix& maxPoolIdx, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemented"; - } - - virtual void maxPool3DBackward(Matrix& outGrad, - Matrix& maxPoolIdx, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - LOG(FATAL) << "Not implemented"; - } - - virtual void avgPool3DForward(Matrix& input, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemented"; - } - - virtual void avgPool3DBackward(Matrix& input, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - LOG(FATAL) << "Not implemented"; - } - - /** - * Input: one or more sequences. Each sequence contains some instances. - * - * Output: output size is the number of input sequences (NOT input - * instances). - * - * output[i] is set to max_input[i].
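// ---------------------------------------------------------------------------
// Minimal single-channel reference of the maxPoolForward() contract above:
// each output cell is the maximum over its (sizeY x sizeX) window, honoring
// stride and zero padding; mask, if non-null, records the argmax position as
// described for maskMatP. Standalone sketch with illustrative names.
#include <limits>

void maxPool2dRef(const float* in, int H, int W, int sizeY, int sizeX,
                  int strideH, int strideW, int padH, int padW,
                  int outH, int outW, float* out, int* mask) {
  for (int oy = 0; oy < outH; ++oy) {
    for (int ox = 0; ox < outW; ++ox) {
      float best = -std::numeric_limits<float>::infinity();
      int bestIdx = -1;
      for (int ky = 0; ky < sizeY; ++ky) {
        for (int kx = 0; kx < sizeX; ++kx) {
          int y = oy * strideH - padH + ky;
          int x = ox * strideW - padW + kx;
          if (y < 0 || y >= H || x < 0 || x >= W) continue;  // zero padding
          if (in[y * W + x] > best) {
            best = in[y * W + x];
            bestIdx = y * W + x;  // flat input index of the maximum
          }
        }
      }
      out[oy * outW + ox] = best;
      if (mask) mask[oy * outW + ox] = bestIdx;
    }
  }
}
// ---------------------------------------------------------------------------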
- */ - virtual void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index) { - LOG(FATAL) << "Not implemented"; - } - - virtual void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this.row[i] += table.row[ids[i]] - * if ids[i] == -1, it will be ignored - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids) { - (void)table; - (void)ids; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * table.row[ids[i]] += this.row[i] - * if ids[i] == -1, it will be ignored - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids) { - (void)table; - (void)ids; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * table[i, id[i]] += this[i] - * @endcode - */ - virtual void addElements(Matrix& table, IVector& ids) { - LOG(FATAL) << "Not implemented"; - } - /** - * @brief cross entropy for multi binary labels - * - * @code - * this[i] = -sum(label[i][j]*log(output[i][j]) - * + (1-label[i][j])*log(1-output[i][j])) - * @endcode - */ - virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief The gradient of cross entropy for multi binary labels on output - * - * @code - * this[i][j] = -label[i][j]/output[i][j] - * + (1-label[i][j])/(1-output[i][j]) - * @endcode - */ - virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief Calculate the classification error for multi binary labels - * - * @code - * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0) - * || (output[i][j] < threshold && label[i][j] == 1)) - * / output->getWidth() - * @endcode - */ - virtual void classificationErrorMulti(Matrix& output, - Matrix& label, - real threshold) { - LOG(FATAL) << "Not implemented"; - } - - virtual void paramReluForward(Matrix& data, Matrix& W) { - LOG(FATAL) << "Not implemented"; - } - virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) { - LOG(FATAL) << "Not implemented"; - } - virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { - LOG(FATAL) << "Not implemented"; - } - - virtual void vol2Col(real* data, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW) { - LOG(FATAL) << "Not implemented"; - } - - virtual void col2Vol(real* trg, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real alpha, - real beta) { - LOG(FATAL) << "Not implemented"; - } - - virtual void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - LOG(FATAL) << "Not implemented"; - } - virtual void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - LOG(FATAL) << "Not implemented"; - } - - template <typename ExpressionType> - void operator=(const ExpressionType& expr) { - if
(useGpu_) { - TensorGpuApply(*this, expr); - } else { - TensorCpuApply(*this, expr); - } - } - - bool isEmpty() const { return data_ == nullptr; } - - explicit operator bool() const { return !isEmpty(); } -}; - -inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { - mat.print(os); - return os; -} - -class GpuMatrix : public Matrix { - public: - GpuMatrix(); - - GpuMatrix(size_t height, size_t width, bool trans = false); - GpuMatrix(real* data, size_t height, size_t width, bool trans = false) - : Matrix(data, height, width, trans, true) {} - GpuMatrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false) - : Matrix(data, height, width, stride, trans, true) {} - GpuMatrix(GpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : Matrix(dataHandle, height, width, trans, true) {} - ~GpuMatrix(); - - void zeroMem(); - void resetOne(); - void setDiag(real value); - - void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* used to allocate space */ - SparseValueType valueType, - SparseFormat format) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - - /** - * Copy the data from cpu_memory buffer - */ - void copyFrom(const real* hostSrc, size_t size); - - void copyFrom(const real* hostSrc, const int64_t* seq); - - void copyFrom(const Matrix& src, hl_stream_t stream); - - void copyFrom(const Matrix& src); - - void copyFrom(const IVector& src); - - void copyByRowIndex(Matrix& b, const IVector& rowIndex); - - MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - - real getElement(size_t x, size_t y) const; - - real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } - virtual real* getRowBuf(size_t row) { return getRow(row); } - - real getSum(); - void accumulateColSum(Matrix& src); - real getAbsSum(); - - real getMin(); - real getMax(); - - MatrixPtr getTranspose(); - void transpose(MatrixPtr& matTrans, bool memAlloc); - void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise); - - MatrixPtr getInverse(); - void inverse(MatrixPtr& matInv, bool memAlloc); - - /// add b to each sample of this. - void addBias(Matrix& b, real scale); - void addSharedBias(Matrix& b, real scale); - - /** - * @code - * add each sample from a to this. 
- * @endcode - */ - void collectBias(Matrix& a, real scale); - void collectSharedBias(Matrix& a, real scale); - - void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); - void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); - - /** - * @code - * this.row[i] += table.row[ids[i]] - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids); - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids); - - /** - * @code - * table.row[ids[i]] += this.row[i] - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids); - - void addColumnVector(const Matrix& b); - - /** - * @code - * this = scaleAB*(a*b) + scaleT*this - * @endcode - */ - void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); - - /** - * @code - * this = a*b - * @endcode - */ - void mul(const Matrix& a, const Matrix& b); - - void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); - - void mul(const GpuSparseMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT); - - void mul(const GpuMatrix& a, - const GpuSparseMatrix& b, - real scaleAB, - real scaleT); - - /** - * @code - * this = scaleAB*(this*b) + scaleT*this - * @endcode - */ - void rightMul(Matrix& b, real scaleAB, real scaleT); - - /** - * @code - * this = this* b - * @endcode - */ - void rightMul(Matrix& b); - - /** - * @code - * this = scaleAB*(a*this) + scaleT*this - * @endcode - */ - void leftMul(Matrix& a, real scaleAB, real scaleT); - - /** - * @code - * this = a*this - * @endcode - */ - void leftMul(Matrix& a); - - void colMerge(Matrix& src); - void rowSum(Matrix& sum); - void rowMax(Matrix& max); - void rowMax(IVector& maxIds, Matrix& max); - void colMax(Matrix& max); - void colMax(IVector& maxIds, Matrix& max); - void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); - void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); - - void oneHotCrossEntropy(Matrix& output, IVector& label); - void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha); - - void softmax(Matrix& output); - void sequenceSoftmax(Matrix& output, const IVector& index); - void softmaxBackward(Matrix& outputV); - void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); - - /// calculate the sum of squares diff cost. - void sumOfSquares(Matrix& output, Matrix& label); - - /// gradient of sumOfSquares. 
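// ---------------------------------------------------------------------------
// Plain reference of the mul() contract documented above,
// C = scaleAB * (A * B) + scaleT * C, for row-major, non-transposed
// A (h x k), B (k x w), C (h x w); equivalent in spirit to a BLAS gemm call.
// Standalone sketch, not the CUDA path actually used by GpuMatrix.
void mulRef(const float* A, const float* B, float* C, int h, int k, int w,
            float scaleAB, float scaleT) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      float acc = 0;
      for (int p = 0; p < k; ++p) acc += A[i * k + p] * B[p * w + j];
      C[i * w + j] = scaleAB * acc + scaleT * C[i * w + j];
    }
  }
}
// ---------------------------------------------------------------------------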
- void sumOfSquaresBp(Matrix& outputV, Matrix& label); - void tanh(Matrix& output); - void tanhDerivative(Matrix& output); - void softrelu(Matrix& output); - void softreluDerivative(Matrix& output); - void scaledTanh(Matrix& output, real p1, real p2); - - virtual void print(std::ostream& os) const; - virtual void print(std::ostream& os, size_t height, size_t width) const; - - void paramReluForward(Matrix& data, Matrix& W); - void paramReluBackwardW(Matrix& oGrad, Matrix& data); - void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); - - void check(std::ostream& os, Matrix& refMat, bool printDiff = true); - void randomizeUniform(); - - void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); - - void upsampleForward(Matrix& input, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW); - - void upsampleBackward(Matrix& outputGrad, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW); - - void maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - MatrixPtr maskMatP); - - void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - bool excludeMode = true); - - void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW, - bool excludeMode = true); - - void maxPool3DForward(Matrix& inputMat, - Matrix& maxPoolIdx, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW); - - void maxPool3DBackward(Matrix& outGrad, - Matrix& maxPoolIdx, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput); - - void avgPool3DForward(Matrix& input, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW); - - void avgPool3DBackward(Matrix& input, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real 
scaleTargets, - real scaleOutput); - - void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index); - - void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index); - - void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void vol2Col(real* data, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW); - - void col2Vol(real* trg, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real alpha, - real beta); - - void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); - - void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); - - template - void operator=(const ExpressionType& expr) { - TensorGpuApply(*this, expr); - } -}; - -class CpuMatrix : public Matrix { - private: - MatrixPtr sftmaxSum_; - MatrixPtr sftmaxDot_; - - public: - CpuMatrix(size_t height, size_t width, bool trans = false); - CpuMatrix(real* data, size_t height, size_t width, bool trans = false) - : Matrix(data, height, width, trans, false) {} - CpuMatrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false) - : Matrix(data, height, width, stride, trans, false) {} - - CpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : Matrix(dataHandle, height, width, trans, false) {} - - ~CpuMatrix(); - - void zeroMem(); - void resetOne(); - void setDiag(real value); - - void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* used to allocate space */ - SparseValueType valueType, - SparseFormat format) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - - real getElement(size_t x, size_t y) const; - real getSum(); - void accumulateColSum(Matrix& src); - real getAbsSum(); - - MatrixPtr getTranspose(); - void transpose(MatrixPtr& matTrans, bool memAlloc); - void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise); - - MatrixPtr getInverse(); - void inverse(MatrixPtr& matInv, bool memAlloc); - - void copyFrom(const Matrix& src); - - void copyFrom(const Matrix& src, hl_stream_t stream); - - void copyFrom(const real* cpuSrc, size_t size); - - void copyFrom(const real* cpuSrc, const int64_t* seq); - - void copyFrom(const IVector& src); - - void copyFrom(CpuSparseMatrix& src); - - void copyByRowIndex(Matrix& b, const IVector& rowIndex); - - MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - - void upsampleForward(Matrix& input, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW); - - void upsampleBackward(Matrix& outputGrad, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW); - - void maxPoolForward(Matrix& inputMat, - 
size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - MatrixPtr maskMatP); - - void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - bool excludeMode = true); - - void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW, - bool excludeMode = true); - - void maxPool3DForward(Matrix& inputMat, - Matrix& maxPoolIdx, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW); - - void maxPool3DBackward(Matrix& outGrad, - Matrix& maxPoolIdx, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput); - - void avgPool3DForward(Matrix& input, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW); - - void avgPool3DBackward(Matrix& input, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput); - - void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index); - - void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index); - - real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } - virtual real* getRowBuf(size_t row) { return getRow(row); } - - public: - /// add b to each sample of this. - void addBias(Matrix& b, real scale); - void addSharedBias(Matrix& b, real scale); - - /// add each sample of a to this. 
- void collectBias(Matrix& a, real scale); - void collectSharedBias(Matrix& a, real scale); - - void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); - void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); - - /** - * @code - * this.row[i] += table.row[ids[i]] - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids); - - /** - * @code - * table.row[ids[i]] += this.row[i] - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids); - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids); - - /** - * @code - * table[i, id[i]] += this[i] - * @endcode - */ - virtual void addElements(Matrix& table, IVector& ids); - - /** - * use abstract getRow() to get row from table. - * - * Define table as template instead of virtual class for performance sake. - * internal used by above two virtual funcs. - */ - template - void selectRowsImp(TableMatType& table, IVector& ids); - template - void addToRowsImp(TableMatType& table, IVector& ids); - - void addColumnVector(const Matrix& b); - - void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); - void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - - void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT); - - static void mul(CpuMatrix* a, - CpuMatrix* b, - CpuSparseMatrix* c, - real scaleAB, - real scaleT); - - /** - * c = a * b - * - * use abstract getRow() to get row from B,C. - * Define B,C as template instead of virtual class for performance sake. - */ - template - static void mul( - CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT); - - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - - void mul(const Matrix& a, const Matrix& b); - - void rightMul(Matrix& b, real scaleAB, real scaleT); - void rightMul(Matrix& b); - - void leftMul(Matrix& a, real scaleAB, real scaleT); - void leftMul(Matrix& a); - void colMerge(Matrix& src); - void rowSum(Matrix& sum); - void rowMaxId(IVector& maxIds); - void rowMax(Matrix& max); - void rowMax(IVector& maxIds, Matrix& maxVal); - void colMax(Matrix& max); - void colMax(IVector& maxIds, Matrix& maxVal); - void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); - void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); - void rowNormalizeL1(Matrix& out); - - void oneHotCrossEntropy(Matrix& output, IVector& label); - void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha); - - void circularConv(Matrix& b, Matrix& c); - void circularConvDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2); - - void softmax(Matrix& output); - void sequenceSoftmax(Matrix& output, const IVector& index); - void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); - - /// calculate the sum of squares diff cost. - void sumOfSquares(Matrix& output, Matrix& label); - - /// gradient of sumOfSquares. 
- void sumOfSquaresBp(Matrix& outputV, Matrix& label); - - void smoothL1(Matrix& output, Matrix& label, real destScale); - void smoothL1Bp(Matrix& output, Matrix& label, real destScale); - - void tanh(Matrix& output); - void tanhDerivative(Matrix& output); - - void softrelu(Matrix& output); - void softreluDerivative(Matrix& output); - void scaledTanh(Matrix& output, real p1, real p2); - - void print(std::ostream& os) const; - void print(std::ostream& os, size_t height, size_t width) const; - void printOneRow(std::ostream& os, size_t idx) const; - - void paramReluForward(Matrix& data, Matrix& W); - void paramReluBackwardW(Matrix& oGrad, Matrix& data); - void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); - - void check(std::ostream& os, Matrix& refMat, bool printDiff = true); - - real getMin(); - real getMax(); - - void randomizeUniform(); - - void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); - - void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec); - - void addByBitCodeBackward(size_t numClasses, - const IVector& codes, - Matrix& vec); - - void mulByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& mat, - const Matrix& input); - - void mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, - Matrix& mat, - const Matrix& input); - - void mulByBitCodeBackwardError(size_t numClasses, - const IVector& codes, - const Matrix& mat, - Matrix& input); - - void sumByBitCode(size_t numClasses, - IVector& codes, - Matrix& sum, - real scaleSum); - - void subByBitCode(size_t numClasses_, IVector& codes); - - void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); - void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); - void classificationErrorMulti(Matrix& output, Matrix& label, real threshold); - - void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void vol2Col(real* data, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW); - - void col2Vol(real* trg, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real alpha, - real beta); - - template - void operator=(const ExpressionType& expr) { - TensorCpuApply(*this, expr); - } -}; - -class SharedCpuMatrix : public CpuMatrix { - public: -#ifndef PADDLE_MOBILE_INFERENCE - /* blockNum is number of partitions of the matrix */ - SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false) - : CpuMatrix(height, width, trans) { - initShared(blockNum); - } - SharedCpuMatrix( - int blockNum, real* data, size_t height, size_t width, bool trans = false) - : CpuMatrix(data, height, width, trans) { - initShared(blockNum); - } - - SharedCpuMatrix(int blockNum, - CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : CpuMatrix(dataHandle, height, width, trans) { - initShared(blockNum); - } - - SharedCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - 
size_t width, - bool trans = false) - : CpuMatrix(dataHandle, height, width, trans) { - initBlock(1); - } - - ~SharedCpuMatrix() {} - - public: - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - virtual void add(Matrix& b, real p1, real p2); - virtual void add(real p1, real p2); - - private: - using Matrix::mul; - void initShared(int blockNum); - void initBlock(int blockNum); - - int blockNum_; - std::vector<std::unique_ptr<std::mutex>> blockLocks_; - ThreadLocal<CpuMatrix> localBuf_; - ThreadLocal<std::vector<size_t>> localBufRows_; - ThreadLocal<std::vector<size_t>> blockSeq_; -#endif -}; - -typedef struct { unsigned int col; } sparse_non_value_t; - -typedef struct { - unsigned int col; - float value; -} sparse_float_value_t; - -} // namespace paddle -#include "ExecViaCpu.h" diff --git a/paddle/math/tests/OriginalOptimizerApi.h b/paddle/math/tests/OriginalOptimizerApi.h deleted file mode 100644 index e30d784b232dd7d477877d3f7c90cd185357328c..0000000000000000000000000000000000000000 --- a/paddle/math/tests/OriginalOptimizerApi.h +++ /dev/null @@ -1,201 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/math/Vector.h" -#include "paddle/utils/GlobalConstants.h" - -using namespace paddle; // NOLINT - -void SparseMomentumParameterOptimizer(const VectorPtr vecs[], - real alpha, - real beta, - real gamma, - real tau, - real learningRate) { - vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT], - -alpha * gamma * learningRate); - vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT], - tau * alpha * gamma * learningRate); - vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT], - tau / beta + 1.0 / alpha, - *vecs[PARAMETER_MOMENTUM_VT], - 1.0 / beta); -} - -void AdagradParameterOptimizer(const VectorPtr vecs[], - real epsilon, - real learningRate, - real momentum, - real decayRate) { - vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT], - 1.0f); - vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM], - *vecs[PARAMETER_GRADIENT_SQURESUM1]); - vecs[PARAMETER_LEARNING_RATE]->add(epsilon); - vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - - vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], - learningRate, - momentum, - decayRate); -} - -void AdaDeltaParameterOptimizer(const VectorPtr vecs[], - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate) { - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou); - - // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) ) - vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1], - *vecs[PARAMETER_GRADIENT_SQURESUM], - epsilon, - epsilon); - vecs[PARAMETER_LEARNING_RATE]->sqrt2(); - - // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 - vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul( -
*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_LEARNING_RATE], - rou, - 1.0f - rou); - - vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], - learningRate, - momentum, - decayRate); -} - -void RMSPropParameterOptimizer(const VectorPtr vecs[], - real accumulatedRou, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate, - bool firstTime) { - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - // For the first time update, make the sum be the current square - // so that the initial estimation of E(g_t^2) will not be too small. - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou); - - // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g - vecs[PARAMETER_GRADIENT_SQURESUM1]->add( - *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou); - - // learn_rate = 1/sqrt( E(g_t^2) - (E(g_t))^2 + epsilon ) - // Basically, if the sign of the gradient changes more often, - // the learning rate will be decreased. - vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]); - vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1], - -1.0f); - vecs[PARAMETER_LEARNING_RATE]->add(epsilon); - vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - - vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], - learningRate, - momentum, - decayRate); -} - -void DecayedAdagradParameterOptimizer(const VectorPtr vecs[], - real accumulatedRou, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate, - bool firstTime) { - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - // For the first time update, make the sum be the current square - // so that the initial estimation of E(g_t^2) will not be too small. - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou); - - // learn_rate = 1/sqrt( E(g_t^2) + epsilon ) - // Basically, the bigger the magnitude of the gradient is, - // the smaller the learning rate will be.
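// ---------------------------------------------------------------------------
// Scalar walk-through of the RMSProp rule spelled out in the comments above
// (a single parameter; momentum, decay and the firstTime special case are
// omitted). Illustrative sketch; these names are not part of the deleted
// header.
#include <cmath>

struct RmsPropState {
  float meanSquare = 0;  // running E(g_t^2)
  float mean = 0;        // running E(g_t)
};

// Returns the updated parameter value after one RMSProp step.
inline float rmsPropStep(RmsPropState& s, float theta, float g, float rho,
                         float epsilon, float learningRate) {
  s.meanSquare = rho * s.meanSquare + (1 - rho) * g * g;  // E(g^2) update
  s.mean = rho * s.mean + (1 - rho) * g;                  // E(g) update
  // Adaptive rate 1/sqrt(E(g^2) - E(g)^2 + eps): it shrinks when the gradient
  // sign flips often, since E(g)^2 then stays far below E(g^2).
  float adaptive = 1.0f / std::sqrt(s.meanSquare - s.mean * s.mean + epsilon);
  return theta - learningRate * adaptive * g;
}
// ---------------------------------------------------------------------------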
- vecs[PARAMETER_LEARNING_RATE]->assign(epsilon); - vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]); - vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - - vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], - learningRate, - momentum, - decayRate); -} - -void AdamParameterOptimizer(const VectorPtr vecs[], - real beta1, - real beta2, - real beta1_power, - real beta2_power, - real epsilon, - real learningRate) { - Vector* m = vecs[PARAMETER_MOMENTUM].get(); - Vector* g = vecs[PARAMETER_GRADIENT].get(); - Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get(); - Vector* theta = vecs[PARAMETER_VALUE].get(); - - // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; - m->add(*g, beta1, 1 - beta1); - - // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 - g->square2(); - v->add(*g, beta2, 1 - beta2); - - // tmp = m_t / ( \sqrt{v_t} + \epsilon ) - // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp - g->sqrt2(*v); - g->dotDiv(*m, *g, 0., epsilon); - real alpha = - learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); - theta->add(*theta, 1.0, *g, -alpha); -} - -void AdamaxParameterOptimizer( - const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) { - Vector* m = vecs[PARAMETER_MOMENTUM].get(); - Vector* g = vecs[PARAMETER_GRADIENT].get(); - Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get(); - Vector* theta = vecs[PARAMETER_VALUE].get(); - - // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; - m->add(*g, beta1, 1 - beta1); - - // u_t = max(\beta_2*u_{t-1}, abs(g_t)) - u->mulScalar(beta2); - g->abs2(); - u->max2(*u, *g); - - // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t - g->dotDiv(*m, *u); - real learningRate = alpha / (1 - std::pow(beta1, step)); - theta->add(*theta, 1.0, *g, -learningRate); -} diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h deleted file mode 100644 index 40ac04ef5d4baa0239bb03b04c3a6cce0fcac5a5..0000000000000000000000000000000000000000 --- a/paddle/math/tests/TensorCheck.h +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -/** - * This file provides a TensorCheck template function, which can be used to - * compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on. 
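// ---------------------------------------------------------------------------
// Scalar version of AdamParameterOptimizer() above, for one parameter.
// beta1Power/beta2Power play the role of beta1_power/beta2_power (beta^t).
// Standalone sketch mirroring the vector code step by step.
#include <cmath>

inline float adamStep(float theta, float g, float& m, float& v, float beta1,
                      float beta2, float beta1Power, float beta2Power,
                      float epsilon, float learningRate) {
  m = beta1 * m + (1 - beta1) * g;      // first moment estimate
  v = beta2 * v + (1 - beta2) * g * g;  // second moment estimate
  float tmp = m / (std::sqrt(v) + epsilon);
  float alpha =
      learningRate * std::sqrt(1 - beta2Power) / (1 - beta1Power);
  return theta - alpha * tmp;
}
// ---------------------------------------------------------------------------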
- */ - -#include <gtest/gtest.h> -#include "paddle/math/Matrix.h" - -namespace autotest { - -using paddle::Matrix; -using paddle::CpuMatrix; -using paddle::GpuMatrix; -using paddle::VectorT; -using paddle::CpuVectorT; -using paddle::GpuVectorT; - -class AssertEqual { - public: - AssertEqual(real err = 0) : err_(err) {} - - inline bool operator()(real a, real b) { - if (err_ == 0) { - if (a != b) { - return false; - } - } else { - if (std::fabs(a - b) > err_) { - if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) { - return false; - } - } - } - - return true; - } - - private: - real err_; -}; - -template <typename Tensor> -class CopyToCpu; - -template <> -class CopyToCpu<CpuMatrix> { - public: - explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {} - const CpuMatrix& copiedArg() const { return arg_; } - - private: - const CpuMatrix& arg_; -}; - -template <> -class CopyToCpu<GpuMatrix> { - public: - explicit CopyToCpu(const GpuMatrix& arg) - : arg_(arg.getHeight(), arg.getWidth()) { - arg_.copyFrom(arg); - } - CpuMatrix& copiedArg() { return arg_; } - - private: - CpuMatrix arg_; -}; - -template <> -class CopyToCpu<Matrix> { - public: - explicit CopyToCpu(const Matrix& arg) - : arg_(arg.getHeight(), arg.getWidth()) { - arg_.copyFrom(arg); - } - CpuMatrix& copiedArg() { return arg_; } - - private: - CpuMatrix arg_; -}; - -template <class T> -class CopyToCpu<CpuVectorT<T>> { - public: - explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {} - const CpuVectorT<T>& copiedArg() const { return arg_; } - - private: - const CpuVectorT<T>& arg_; -}; - -template <class T> -class CopyToCpu<GpuVectorT<T>> { - public: - explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) { - arg_.copyFrom(arg); - } - CpuVectorT<T>& copiedArg() { return arg_; } - - private: - CpuVectorT<T> arg_; -}; - -template <class T> -class CopyToCpu<VectorT<T>> { - public: - explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) { - arg_.copyFrom(arg); - } - CpuVectorT<T>& copiedArg() { return arg_; } - - private: - CpuVectorT<T> arg_; -}; - -template <typename AssertEq> -void TensorCheck(AssertEq compare, - const CpuMatrix& matrix1, - const CpuMatrix& matrix2) { - CHECK(matrix1.getHeight() == matrix2.getHeight()); - CHECK(matrix1.getWidth() == matrix2.getWidth()); - - int height = matrix1.getHeight(); - int width = matrix1.getWidth(); - const real* data1 = matrix1.getData(); - const real* data2 = matrix2.getData(); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - real a = data1[i * width + j]; - real b = data2[i * width + j]; - if (!compare(a, b)) { - count++; - } - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different elements."; -} - -template <typename AssertEq, class T> -void TensorCheck(AssertEq compare, - const CpuVectorT<T>& vector1, - const CpuVectorT<T>& vector2) { - CHECK(vector1.getSize() == vector2.getSize()); - - const T* data1 = vector1.getData(); - const T* data2 = vector2.getData(); - size_t size = vector1.getSize(); - int count = 0; - for (size_t i = 0; i < size; i++) { - real a = data1[i]; - real b = data2[i]; - if (!compare(a, b)) { - count++; - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different elements."; -} - -template <typename AssertEq, typename Tensor1, typename Tensor2> -void TensorCheck(AssertEq compare, - const Tensor1& tensor1, - const Tensor2& tensor2) { - TensorCheck(compare, - CopyToCpu<Tensor1>(tensor1).copiedArg(), - CopyToCpu<Tensor2>(tensor2).copiedArg()); -} - -template <typename AssertEq> -void TensorCheck(AssertEq compare, real args1, real args2) { - EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1 - << ", args2 = " << args2; -} - -template <typename AssertEq> -void TensorCheck(AssertEq compare, size_t args1, size_t args2) { - EXPECT_EQ(args1, args2) << "[Test error] args1 = " <<
args1 - << ", args2 = " << args2; -} - -template -void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) { - AssertEqual compare(0); - TensorCheck(compare, - CopyToCpu(tensor1).copiedArg(), - CopyToCpu(tensor2).copiedArg()); -} - -template -void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) { -#ifndef PADDLE_TYPE_DOUBLE - AssertEqual compare(1e-3); -#else - AssertEqual compare(1e-10); -#endif - TensorCheck(compare, - CopyToCpu(tensor1).copiedArg(), - CopyToCpu(tensor2).copiedArg()); -} - -} // namespace autotest diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h deleted file mode 100644 index e1966ec8a74747960420ec80fdfbb957f7cf177f..0000000000000000000000000000000000000000 --- a/paddle/math/tests/TestUtils.h +++ /dev/null @@ -1,294 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -/** - * This file provides a AutoCompare calss to simplify the comparison - * of CPU and GPU member functions. - * - * This takes two steps - * 1. Construct an AutoCompare object. - * When constructing an AutoCompare object, you can set the err argument - * to specify the maximum error for CPU and GPU functions. - * - * 2. Use the template functions cmpWithArg or cmpWithoutArg. - * A. [cmpWithArg] Requires the caller construct the cpu arguments. - * - * AutoCompare test; - * Init Argument arg1,arg2... - * test.cmpWithArg(function, arg1, arg2....) - * - * B. [cmpWithoutArg] The caller do not need construct arguments. - * If matrix used in these functions arguments is the same size. - * Such as the element wise function and the aggregate function - * defined in the BaseMatrix.cpp. - * - * AutoCompare test; - * test.cmpWithoutArg(function, height, width) - */ - -#include -#include "TensorCheck.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" - -namespace autotest { - -using paddle::BaseMatrix; -using paddle::CpuMatrix; -using paddle::GpuMatrix; -using paddle::CpuIVector; -using paddle::GpuIVector; -using paddle::CpuSparseMatrix; -using paddle::GpuSparseMatrix; - -template -class ReplaceType { - public: - typedef T1 type; -}; - -template <> -class ReplaceType { - public: - typedef CpuMatrix type; -}; - -template <> -class ReplaceType { - public: - typedef GpuMatrix type; -}; - -template <> -class ReplaceType { - public: - typedef CpuMatrix type; -}; - -template <> -class ReplaceType { - public: - typedef GpuMatrix type; -}; - -// construct a argument -template -T construct(int height, int width); - -template <> -float construct(int height, int width) { - return 0.5; -} - -template <> -double construct(int height, int width) { - return 0.5; -} - -template <> -size_t construct(int height, int width) { - size_t offset = std::rand() % (height < width ? 
height : width); - return offset; -} - -template <> -CpuMatrix construct(int height, int width) { - CpuMatrix a(height, width); - return a; -} - -template <> -GpuMatrix construct(int height, int width) { - GpuMatrix a(height, width); - return a; -} - -// init a argument -template -void init(T& v) { - return; -} - -template <> -void init(CpuMatrix& v) { - v.randomizeUniform(); -} - -template <> -void init(GpuMatrix& v) { - v.randomizeUniform(); -} - -// init a tuple which contains a set of arguments. -template -inline typename std::enable_if::type initTuple( - std::tuple& t) {} - -template - inline typename std::enable_if < - I::type initTuple(std::tuple& t) { - init(std::get(t)); - initTuple(t); -} - -// copy a argument, copy src to dest -template -void copy(T1& dest, T2& src) { - dest = src; -} - -template <> -void copy(GpuMatrix& dest, CpuMatrix& src) { - dest.copyFrom(src); -} - -// copy a tuple, copy src to dest -template -inline typename std::enable_if::type copyTuple( - std::tuple& dest, std::tuple& src) {} - -template - inline typename std::enable_if < - I::type copyTuple(std::tuple& dest, - std::tuple& src) { - copy(std::get(dest), std::get(src)); - copyTuple(dest, src); -} - -// call member function -template -R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) { - return (obj.*f)(args...); -} - -template -class ReturnType { - public: - typedef T type; -}; - -template <> -class ReturnType { - public: - typedef GpuMatrix type; -}; - -template <> -class ReturnType { - public: - typedef GpuIVector type; -}; - -template <> -class ReturnType { - public: - typedef GpuSparseMatrix type; -}; - -template -typename ReturnType::type autoArgs(T& v) { - return v; -} - -template <> -GpuMatrix autoArgs(CpuMatrix& v) { - GpuMatrix a(v.getHeight(), v.getWidth()); - a.copyFrom(v); - return a; -} - -template <> -GpuIVector autoArgs(CpuIVector& v) { - GpuIVector a(v.getSize()); - a.copyFrom(v); - return a; -} - -template <> -GpuSparseMatrix autoArgs(CpuSparseMatrix& v) { - GpuSparseMatrix a(v.getHeight(), - v.getWidth(), - v.getElementCnt(), - v.getValueType(), - v.getFormat()); - a.copyFrom(v, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - return a; -} - -class AutoCompare { - public: - /** - * err is the allowed calculation error. - * The smaller the value of err, - * the stricter the comparison is between CPU and GPU calculations. - */ - AutoCompare(size_t height, size_t width, real err = 1e-3) - : cpu(height, width), gpu(height, width), compare(err) { - init(cpu); - copy(gpu, cpu); - } - - template - void cmpWithArg(R (C::*f)(FArgs...), Args&&... 
args) { - static_assert(sizeof...(FArgs) == sizeof...(Args), - "size of parameter packs are not equal"); - call(cpu, f, args...); - call(gpu, f, autoArgs(args)...); - - TensorCheck(compare, cpu, gpu); - } - - template - void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) { - static_assert(sizeof...(I) == sizeof...(Args), - "size of parameter packs are not equal"); - (void)height; - (void)width; - auto tuple1 = std::make_tuple( - construct>::type>::type, - CpuMatrix>::type>(height, width)...); - - auto tuple2 = std::make_tuple( - construct>::type>::type, - GpuMatrix>::type>(height, width)...); - - initTuple(tuple1); - copyTuple(tuple2, tuple1); - - call(cpu, f, std::get(tuple1)...); - call(gpu, f, std::get(tuple2)...); - - TensorCheck(compare, cpu, gpu); - } - - protected: - CpuMatrix cpu; - GpuMatrix gpu; - AssertEqual compare; -}; - -} // namespace autotest diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp deleted file mode 100644 index 84bc1c1d9e0a8368a69c1e53a63056eb45b9239f..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_Allocator.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/utils/Logging.h" -#include "paddle/utils/Util.h" -#define private public -#include "paddle/math/Allocator.h" -#include "paddle/math/MemoryHandle.h" -#include "paddle/math/PoolAllocator.h" - -using namespace paddle; // NOLINT - -template -void testPoolAllocator() { - PoolAllocator* pool = - new PoolAllocator(new Allocator(), /* sizeLimit */ 1024); - - /* alloc from system memory */ - void* ptr1 = pool->alloc(10); - void* ptr2 = pool->alloc(200); - void* ptr3 = pool->alloc(200); - pool->free(ptr1, 10); - pool->free(ptr2, 200); - pool->free(ptr3, 200); - pool->printAll(); - EXPECT_EQ((size_t)2, pool->pool_.size()); - EXPECT_EQ((size_t)1, pool->pool_[10].size()); - EXPECT_EQ((size_t)2, pool->pool_[200].size()); - EXPECT_EQ(ptr1, pool->pool_[10][0]); - EXPECT_EQ(ptr2, pool->pool_[200][0]); - EXPECT_EQ(ptr3, pool->pool_[200][1]); - - /* alloc from pool */ - void* ptr4 = pool->alloc(10); - void* ptr5 = pool->alloc(200); - pool->printAll(); - EXPECT_EQ((size_t)0, pool->pool_[10].size()); - EXPECT_EQ((size_t)1, pool->pool_[200].size()); - EXPECT_EQ(ptr1, ptr4); - EXPECT_EQ(ptr3, ptr5); - pool->free(ptr4, 10); - pool->free(ptr5, 200); - - /* alloc size > sizeLimit */ - void* ptr6 = pool->alloc(1024); - pool->free(ptr6, 1024); - EXPECT_LE((size_t)1024, pool->poolMemorySize_); - - void* ptr7 = pool->alloc(1); - EXPECT_EQ((size_t)0, pool->poolMemorySize_); - EXPECT_EQ((size_t)0, pool->pool_.size()); - pool->free(ptr7, 1); - - delete pool; -} - -TEST(Allocator, Pool) { - testPoolAllocator(); -#ifdef PADDLE_WITH_CUDA - testPoolAllocator(); -#endif -} - -TEST(MemoryHandle, Cpu) { - for (auto size : {10, 30, 50, 100, 200, 512, 1000, 1023, 1024, 1025, 8193}) { - CpuMemoryHandle handle(size); - EXPECT_LE(handle.getSize(), handle.getAllocSize()); 
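-    // getAllocSize() may round the request up (for alignment or pool
-    // bucketing), so the usable size never exceeds the allocated size.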
-  }
-
-  void* ptr1;
-  void* ptr2;
-  {
-    CpuMemoryHandle handle(256);
-    ptr1 = handle.getBuf();
-  }
-  {
-    CpuMemoryHandle handle(256);
-    ptr2 = handle.getBuf();
-  }
-  EXPECT_EQ(ptr1, ptr2);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(MemoryHandle, Gpu) {
-  int numGpu = hl_get_device_count();
-
-  /* alloc from system memory */
-  void* ptr3[numGpu];
-  void* ptr4[numGpu];
-  for (int i = 0; i < numGpu; i++) {
-    SetDevice device(i);
-    GpuMemoryHandle handle1(30);
-    GpuMemoryHandle handle2(30);
-    GpuMemoryHandle handle3(4000);
-    GpuMemoryHandle handle4(500);
-    ptr3[i] = handle3.getBuf();
-    ptr4[i] = handle4.getBuf();
-  }
-
-  /* alloc from pool */
-  for (int i = 0; i < numGpu; i++) {
-    SetDevice device(i);
-    GpuMemoryHandle handle1(30);
-    GpuMemoryHandle handle3(4000);
-    GpuMemoryHandle handle4(500);
-    EXPECT_EQ(ptr3[i], handle3.getBuf());
-    EXPECT_EQ(ptr4[i], handle4.getBuf());
-  }
-}
-#endif
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
deleted file mode 100644
index 6f7beb60c8f535d51b18c4984b89d1972f4c82bd..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/**
- * This test file uses autotest::AutoCompare and cmpWithoutArg to compare the
- * CPU and GPU implementations of the member functions in
- * BaseMatrix.cpp and Matrix.cpp.
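- *
- * Each case follows the same pattern (a sketch; the tests below sweep many
- * height/width combinations):
- *
- *   AutoCompare test(height, width, 1e-5);  // maximum allowed error
- *   test.cmpWithoutArg(&BaseMatrix::neg, height, width);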
- */ - -#include -#include "TestUtils.h" -#include "paddle/math/BaseMatrix.h" - -using paddle::BaseMatrix; -using paddle::Matrix; -using autotest::AutoCompare; - -// Test all void (BaseMatrix::*)() function -TEST(BaseMatrix, void) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, width](void (BaseMatrix::*f)()) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg(f, height, width); - }; - - compare(&BaseMatrix::neg); - compare(&BaseMatrix::exp2); - compare(&BaseMatrix::log2); - compare(&BaseMatrix::sqrt2); - compare(&BaseMatrix::square2); - compare(&BaseMatrix::reciprocal2); - compare(&BaseMatrix::abs2); - compare(&BaseMatrix::sign2); - compare(&BaseMatrix::zero); - compare(&BaseMatrix::one); - } - } -} - -// Test all void (BaseMatrix::*)(real) function -TEST(BaseMatrix, real) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, width](void (BaseMatrix::*f)(real)) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg<0>(f, height, width); - }; - - compare(&BaseMatrix::pow2); - compare(&BaseMatrix::subScalar); - compare(&BaseMatrix::mulScalar); - compare(&BaseMatrix::divScalar); - compare(&BaseMatrix::assign); - compare(&BaseMatrix::add); - compare(&BaseMatrix::biggerThanScalar); - compare(&BaseMatrix::downClip); - } - } -} - -// Test all void (BaseMatrix::*)(BaseMatrix&) function -TEST(BaseMatrix, BaseMatrix) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg<0>(f, height, width); - }; - - compare(&BaseMatrix::assign); - compare(&BaseMatrix::add); - compare(&BaseMatrix::relu); - compare(&BaseMatrix::reluDerivative); - compare(&BaseMatrix::softrelu); - compare(&BaseMatrix::softreluDerivative); - compare(&BaseMatrix::brelu); - compare(&BaseMatrix::breluDerivative); - compare(&BaseMatrix::square2); - compare(&BaseMatrix::squareDerivative); - compare(&BaseMatrix::tanh); - compare(&BaseMatrix::tanhDerivative); - compare(&BaseMatrix::reciprocal2); - compare(&BaseMatrix::reciprocalDerivative); - compare(&BaseMatrix::abs2); - compare(&BaseMatrix::absDerivative); - compare(&BaseMatrix::sigmoid); - compare(&BaseMatrix::sigmoidDerivative); - compare(&BaseMatrix::expDerivative); - compare(&BaseMatrix::sign2); - compare(&BaseMatrix::exp2); - compare(&BaseMatrix::log2); - compare(&BaseMatrix::sqrt2); - compare(&BaseMatrix::dotMul); - compare(&BaseMatrix::dotMulSquare); - compare(&BaseMatrix::dotSquareMul); - compare(&BaseMatrix::addColVector); - compare(&BaseMatrix::addRowVector); - compare(&BaseMatrix::mulRowVector); - compare(&BaseMatrix::divRowVector); - compare(&BaseMatrix::mulColVector); - compare(&BaseMatrix::divColVector); - compare(&BaseMatrix::addP2P); - compare(&BaseMatrix::invSqrt); - } - } -} - -// Test all void (BaseMatrix::*)(real, real) function -TEST(BaseMatrix, real_real) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, width](void (BaseMatrix::*f)(real, real)) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg<0, 1>(f, height, width); - }; - - compare(&BaseMatrix::add); - compare(&BaseMatrix::clip); - } - } -} - -// Test all void (BaseMatrix::*)(BaseMatrix&, real) function -TEST(BaseMatrix, 
BaseMatrix_real) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg<0, 1>(f, height, width); - }; - - compare(&BaseMatrix::addBias); - compare(&BaseMatrix::add); - compare(&BaseMatrix::sub); - compare(&BaseMatrix::pow2); - compare(&BaseMatrix::addScalar); - compare(&BaseMatrix::subScalar); - compare(&BaseMatrix::mulScalar); - compare(&BaseMatrix::divScalar); - compare(&BaseMatrix::scalarDiv); - compare(&BaseMatrix::addSquare); - compare(&BaseMatrix::isEqualTo); - } - } -} - -// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) function -TEST(BaseMatrix, BaseMatrix_BaseMatrix) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, - width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg<0, 1>(f, height, width); - }; - - compare(&BaseMatrix::softCrossEntropy); - compare(&BaseMatrix::softCrossEntropyBp); - compare(&BaseMatrix::binaryLabelCrossEntropy); - compare(&BaseMatrix::binaryLabelCrossEntropyBp); - compare(&BaseMatrix::sub); - compare(&BaseMatrix::add2); - compare(&BaseMatrix::dotMul); - compare(&BaseMatrix::dotDiv); - compare(&BaseMatrix::logisticRegressionLoss); - compare(&BaseMatrix::logisticRegressionLossBp); - compare(&BaseMatrix::biggerThan); - compare(&BaseMatrix::max2); - compare(&BaseMatrix::dotMulSquare); - compare(&BaseMatrix::dotSquareSquare); - } - } -} - -void TestEelementWise(size_t height, size_t width) { - AutoCompare rowScale(height, width); - rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width); - - AutoCompare rowDotMul(height, width); - rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width); - - AutoCompare binaryClassificationError(height, width); - binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>( - &BaseMatrix::binaryClassificationError, height, width); - - AutoCompare sumOfSquaresBp(height, width); - sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width); -} - -void TestAggregateToRow(size_t height, size_t width) { - AutoCompare maxCols(1, width); - maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width); - - AutoCompare minCols(1, width); - minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width); - - AutoCompare addDotMulVMM(1, width); - addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width); - - AutoCompare sumCols(1, width); - sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width); - - AutoCompare collectBias(1, width); - collectBias.cmpWithoutArg<0, 1>( - static_cast(&Matrix::collectBias), - height, - width); -} - -void TestAggregateToCol(size_t height, size_t width) { - AutoCompare maxRows(height, 1); - maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width); - - AutoCompare minRows(height, 1); - minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width); - - AutoCompare sumRows(height, 1); - sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width); - - AutoCompare sumOfSquares(height, 1); - sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width); -} - -TEST(BaseMatrix, Other) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - TestEelementWise(height, width); - TestAggregateToRow(height, width); - 
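-      // The row aggregation above reduces each column into a 1 x width
-      // result (maxCols, sumCols, ...); the column aggregation below
-      // reduces each row into a height x 1 result (maxRows, sumRows, ...).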
TestAggregateToCol(height, width); - } - } -} - -#endif diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp deleted file mode 100644 index 395541a76ae5e5497fdaa8b4870e421cbf62608a..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_CpuGpuVector.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_CUDA - -#include -#include "paddle/math/Vector.h" -#include "paddle/utils/Util.h" -#include "test_matrixUtil.h" - -using namespace paddle; // NOLINT - -TEST(CpuGpuVector, getData) { - size_t size = 500; - hl_stream_t stream(HPPL_STREAM_DEFAULT); - CpuVectorPtr cpuVec = std::make_shared(size); - GpuVectorPtr gpuVec = std::make_shared(size); - cpuVec->uniform(0.0, 10.0); - gpuVec->copyFrom(*cpuVec, stream); - hl_stream_synchronize(stream); - - CpuGpuVectorPtr vec = std::make_shared(gpuVec); - auto a = vec->getData(false); - auto b = cpuVec->getData(); - hl_stream_synchronize(stream); - checkDataEqual(a, b, size); -} - -TEST(CpuGpuVector, subCreate) { - size_t size1 = 1024; - size_t offset = 100; - size_t size2 = 500; - hl_stream_t stream(HPPL_STREAM_DEFAULT); - CpuGpuVectorPtr v1 = std::make_shared(size1, /*useGpu*/ false); - auto vec = v1->getMutableVector(false); - vec->uniform(0.0, 10.0); - auto v2 = std::make_shared(*v1, offset, size2); - CHECK_EQ(*v1->getSync(), *v2->getSync()); - - // check subVec equal - checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2); - - CpuVectorPtr v1Check = std::make_shared(size1); - CpuVectorPtr v2Check = std::make_shared(size2); - v1Check->copyFrom(*(v1->getVector(true)), stream); - v2Check->copyFrom(*(v2->getVector(true)), stream); - hl_stream_synchronize(stream); - - checkDataEqual(v2->getData(false), v2Check->getData(), size2); - checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2); - - CpuVectorPtr noise = std::make_shared(size2); - noise->uniform(0.0, 1.0); - auto v = v2->getMutableVector(false); // will change header - // add noise to subVec - v->add(*noise); - - // check v1_cpu_data == v2_cpu_data - checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2); - - v1Check->copyFrom(*(v1->getVector(true)), stream); - v2Check->copyFrom(*(v2->getVector(true)), stream); - hl_stream_synchronize(stream); - - // check v1_gpu_data == v2_gpu_data - checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2); -} - -#endif diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp deleted file mode 100644 index 72256cb9d4c93159418d27c7ca0d4f8b9a412a64..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_ExecViaCpu.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include "paddle/math/SparseMatrix.h" - -using namespace paddle; // NOLINT - -const int height = 10; -const int width = 16; - -real f(Matrix& mat1, - const Matrix& mat2, - IVector& vec1, - const IVector& vec2, - real scalar) { - CHECK(!mat1.useGpu()); - CHECK(!mat2.useGpu()); - CHECK(!vec1.useGpu()); - CHECK(!vec2.useGpu()); - mat1.copyFrom(mat2); - vec1.copyFrom(vec2); - - return scalar; -} - -class Functor { - public: - real operator()(Matrix& mat1, - const Matrix& mat2, - IVector& vec1, - const IVector& vec2, - real scalar) { - a_ = f(mat1, mat2, vec1, vec2, scalar); - return a_; - } - - private: - real a_; -}; - -template -void testWrapper(F&& f) { - MatrixPtr cpumat1 = Matrix::create(height, width, false, /*useGpu=*/false); - MatrixPtr cpumat2 = Matrix::create(height, width, false, /*useGpu=*/false); - - IVectorPtr cpuvec1 = IVector::create(height, /*useGpu=*/false); - IVectorPtr cpuvec2 = IVector::create(height, /*useGpu=*/false); - - const real scalar = 1.23456; - - MatrixPtr gpumat1 = Matrix::create(height, width, false, /*useGpu=*/true); - MatrixPtr gpumat2 = Matrix::create(height, width, false, /*useGpu=*/true); - IVectorPtr gpuvec1 = IVector::create(height, /*useGpu=*/true); - IVectorPtr gpuvec2 = IVector::create(height, /*useGpu=*/true); - - cpumat2->randomizeUniform(); - cpuvec2->rand(width); - gpumat2->copyFrom(*cpumat2); - gpuvec2->copyFrom(*cpuvec2); - - real ret = execViaCpu(f, *gpumat1, *gpumat2, *gpuvec1, *gpuvec2, 1.23456); - EXPECT_EQ(ret, scalar); - cpumat1->copyFrom(*gpumat1); - cpuvec1->copyFrom(*gpuvec1); - - for (int i = 0; i < height; ++i) { - EXPECT_EQ(cpuvec1->getElement(i), cpuvec2->getElement(i)); - for (int j = 0; j < width; ++j) { - EXPECT_EQ(cpumat1->getElement(i, j), cpumat2->getElement(i, j)); - } - } - gpumat1->resize(height, 1); - execViaCpu2(&CpuMatrix::selectElements, *gpumat1, *gpumat2, *gpuvec1); - - cpumat1->resize(height, 1); - cpumat1->selectElements(*cpumat2, *cpuvec1); - for (int i = 0; i < height; ++i) { - EXPECT_EQ(cpumat1->getElement(i, 0), gpumat1->getElement(i, 0)); - } -} - -#ifdef PADDLE_WITH_CUDA -TEST(ExecViaCpu, test1) { - testWrapper(f); - testWrapper(&f); - - auto lambda = [](Matrix& mat1, - const Matrix& mat2, - IVector& vec1, - const IVector& vec2, - real scalar) -> real { - return f(mat1, mat2, vec1, vec2, scalar); - }; - LOG(INFO) << "lambda is_class=" << std::is_class::value - << " is_function=" << std::is_function::value; - testWrapper(lambda); - - Functor functor; - testWrapper(functor); -} -#endif diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp deleted file mode 100644 index d87fdcda9edc8644301b7fe77f4c0c751d5a774a..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_FPException.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/**
- * This test is about floating point calculation exceptions.
- * Paddle catches FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW exceptions.
- *
- * Some exceptions occur in the middle of a set of formulas
- * and can be circumvented by some tricks.
- * For example, when calculating tanh
- *   b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
- *
- * if the result of (-2 * a) is too large,
- * an FE_OVERFLOW exception occurs when calculating exp.
- * But the result of tanh itself cannot overflow,
- * so we can add some tricks to prevent exp from computing an excessive value.
- */
-
-#include <gtest/gtest.h>
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Common.h"
-
-using namespace paddle;  // NOLINT
-
-void SetTensorValue(Matrix& matrix, real value) {
-  int height = matrix.getHeight();
-  int width = matrix.getWidth();
-  int stride = matrix.getStride();
-  real* data = matrix.getData();
-  for (int i = 0; i < height; i++) {
-    int j = rand() % width;  // NOLINT
-    if (typeid(matrix) == typeid(CpuMatrix)) {
-      data[i * stride + j] = value;
-    } else if (typeid(matrix) == typeid(GpuMatrix)) {
-      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
-    } else {
-      LOG(FATAL) << "should not reach here";
-    }
-  }
-}
-
-template <typename Matrix>
-void testTanh(real illegal) {
-  MatrixPtr A = std::make_shared<Matrix>(10, 10);
-  MatrixPtr B = std::make_shared<Matrix>(10, 10);
-  A->randomizeUniform();
-  B->randomizeUniform();
-
-  SetTensorValue(*A, illegal);
-
-  A->tanh(*B);
-}
-
-template <typename Matrix>
-void testSigmoid(real illegal) {
-  MatrixPtr A = std::make_shared<Matrix>(10, 10);
-  MatrixPtr B = std::make_shared<Matrix>(10, 10);
-  A->randomizeUniform();
-  B->randomizeUniform();
-
-  SetTensorValue(*A, illegal);
-
-  A->sigmoid(*B);
-}
-
-TEST(fp, overflow) {
-  for (auto illegal : {-90.0, 90.0}) {
-    LOG(INFO) << " illegal=" << illegal;
-    testTanh<CpuMatrix>(illegal);
-    testSigmoid<CpuMatrix>(illegal);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
deleted file mode 100644
index 828159660bae1ad1c0b56fd7202f0357549877ca..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#ifdef PADDLE_WITH_CUDA - -#include -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/testing/TestUtil.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) { - CHECK(matrix1.getHeight() == matrix2.getHeight()); - CHECK(matrix1.getWidth() == matrix2.getWidth()); -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - - int height = matrix1.getHeight(); - int width = matrix1.getWidth(); - const real* data1 = matrix1.getData(); - const real* data2 = matrix2.getData(); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - real a = data1[i * width + j]; - real b = data2[i * width + j]; - if (fabs(a - b) > err) { - if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) { - count++; - } - } - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -void testBilinearFwdBwd(int numSamples, - int imgSizeH, - int imgSizeW, - int channels) { - int inWidth = imgSizeH * imgSizeW * channels; - int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels; - real ratioH = 0.5; - real ratioW = 0.5; - - // forward - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); - - input->randomizeUniform(); - inputGpu->copyFrom(*input); - - { - // nvprof: GPU Proflier - REGISTER_GPU_PROFILER("testBilinearFwdBwd"); - target->bilinearForward(*input, - imgSizeH, - imgSizeW, - 2 * imgSizeH, - 2 * imgSizeW, - channels, - ratioH, - ratioW); - targetGpu->bilinearForward(*inputGpu, - imgSizeH, - imgSizeW, - 2 * imgSizeH, - 2 * imgSizeW, - channels, - ratioH, - ratioW); - } - - // check - targetCheck->copyFrom(*targetGpu); - MatrixCheckErr(*target, *targetCheck); - - // backward - MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - MatrixPtr targetCheckGrad = - CpuMatrix::create(numSamples, inWidth, false, false); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->bilinearBackward(*targetGrad, - 2 * imgSizeH, - 2 * imgSizeW, - imgSizeH, - imgSizeW, - channels, - ratioH, - ratioW); - inputGpuGrad->bilinearBackward(*targetGpuGrad, - 2 * imgSizeH, - 2 * imgSizeW, - imgSizeH, - imgSizeW, - channels, - ratioH, - ratioW); - - // check - targetCheckGrad->copyFrom(*inputGpuGrad); - MatrixCheckErr(*inputGrad, *targetCheckGrad); -} - -TEST(Profiler, testBilinearFwdBwd) { - auto numSamples = 10; - auto channels = 16; - auto imgSize = 64; - { - // nvprof: GPU Proflier - REGISTER_GPU_PROFILER("testBilinearFwdBwd"); - // Paddle built-in timer - REGISTER_TIMER_INFO( - "testBilinearFwdBwd", - "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"); - testBilinearFwdBwd(numSamples, imgSize, imgSize, channels); - } - globalStat.printAllStatus(); -} - -int 
main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - - // nvprof: GPU Proflier - REGISTER_GPU_PROFILER( - "RecursiveProfilingTest", - "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"); - - return RUN_ALL_TESTS(); -} - -#endif diff --git a/paddle/math/tests/test_RowBuffer.cpp b/paddle/math/tests/test_RowBuffer.cpp deleted file mode 100644 index e38de853e03874be3fd3582f7b39b1d490886d78..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_RowBuffer.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/math/RowBuffer.h" - -TEST(RowBuffer, testAutoGrow) { - paddle::RowBuffer buf(128); - ASSERT_EQ(128UL, buf.getWidth()); - ASSERT_TRUE(buf.isAutoGrowth()); - buf.resize(2); - ASSERT_EQ(2UL, buf.getRowCount()); - for (size_t i = 0; i < buf.getWidth() * 2; ++i) { - buf.data()[i] = i; - } - for (size_t i = 0; i < buf.getRowCount(); ++i) { - for (size_t j = 0; j < buf.getWidth(); ++j) { - ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5); - } - } - - auto data = buf.getWithAutoGrowth(2); - for (size_t i = 0; i < buf.getWidth(); ++i) { - data[i] = i; - } - - ASSERT_EQ(3UL, buf.getRowCount()); - for (size_t i = 0; i < buf.getRowCount() - 1; ++i) { - for (size_t j = 0; j < buf.getWidth(); ++j) { - ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5); - } - } - for (size_t i = 0; i < buf.getWidth(); ++i) { - ASSERT_NEAR(i, buf.get(2)[i], 1e-5); - } -} - -TEST(RowBuffer, testWithMemBuf) { - paddle::CpuMemHandlePtr mem = - std::make_shared(128 * 2 * sizeof(real)); - paddle::RowBuffer buf(mem, 128); - ASSERT_TRUE(!buf.isAutoGrowth()); - ASSERT_EQ(2UL, buf.getRowCount()); - for (size_t i = 0; i < buf.getWidth() * 2; ++i) { - buf.data()[i] = i; - } - for (size_t i = 0; i < buf.getRowCount(); ++i) { - for (size_t j = 0; j < buf.getWidth(); ++j) { - ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5); - } - } - - ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*"); -} diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp deleted file mode 100644 index b692679436ee7bd3b8c4a675e969e15b065cc534..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_SIMDFunctions.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/math/SIMDFunctions.h" -#include "paddle/utils/Util.h" - -#include - -#include -#include -#include -#include - -#include -#include - -static constexpr size_t VECTOR_LEN = 3072; -static constexpr size_t BATCH_SIZE = 64; -static constexpr size_t ALIGN = 32; -static_assert(VECTOR_LEN % ALIGN == 0, "VECTOR_LEN % ALIGN == 0"); -static_assert(BATCH_SIZE % ALIGN == 0, "BATCH_SIZE % ALIGN == 0"); -static constexpr float EPSILON = 1e-5; -static std::mt19937 RandomEngine(time(0)); - -inline static std::unique_ptr NewVector(size_t len = VECTOR_LEN, - size_t align = ALIGN) { - float* ptr; - CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0); - return std::unique_ptr(ptr); -} - -inline static std::unique_ptr NewRandomVector(size_t len = VECTOR_LEN, - size_t align = ALIGN) { - std::uniform_real_distribution dist(-100.0f, 100.0f); - auto generator = std::bind(dist, RandomEngine); - auto retv = NewVector(len, align); - std::generate_n(retv.get(), len, generator); - return retv; -} - -TEST(SIMDFunction, addTo) { - typedef std::function AddToMethodType; - - AddToMethodType naive = paddle::simd::naive::addTo; - AddToMethodType simd = paddle::simd::addTo; - - auto A = NewRandomVector(); - auto B = NewRandomVector(); - - auto ACopy = NewVector(); - memcpy(ACopy.get(), A.get(), VECTOR_LEN * sizeof(float)); - - naive(A.get(), B.get(), VECTOR_LEN); - simd(ACopy.get(), B.get(), VECTOR_LEN); - - for (size_t i = 0; i < VECTOR_LEN; ++i) { - ASSERT_NEAR(A[i], ACopy[i], EPSILON); - } -} - -TEST(SIMDFunction, batchAddTo) { - auto A = NewRandomVector(); - auto ACopy = NewVector(); - memcpy(ACopy.get(), A.get(), sizeof(float) * VECTOR_LEN); - - std::vector> B; - for (size_t i = 0; i < BATCH_SIZE; ++i) { - B.emplace_back(NewRandomVector()); - } - std::unique_ptr BRaw(new float*[BATCH_SIZE]); - for (size_t i = 0; i < BATCH_SIZE; ++i) { - BRaw[i] = B[i].get(); - } - - typedef std::function - BatchAddToMethodType; - - BatchAddToMethodType naive = paddle::simd::naive::batchAddTo; - BatchAddToMethodType simd = paddle::simd::batchAddTo; - - naive(A.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN); - simd(ACopy.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN); - - for (size_t i = 0; i < VECTOR_LEN; ++i) { - ASSERT_NEAR(A[i], ACopy[i], EPSILON); - } -} - -TEST(SIMDFunction, colMax) { - auto A = NewRandomVector(VECTOR_LEN * BATCH_SIZE); - auto naiveResult = NewVector(BATCH_SIZE); - auto simdResult = NewVector(BATCH_SIZE); - - typedef std::function ColMaxMethodType; - ColMaxMethodType naive = paddle::simd::naive::colMax; - ColMaxMethodType simd = paddle::simd::colMax; - - naive(naiveResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN); - simd(simdResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN); - - for (size_t i = 0; i < BATCH_SIZE; ++i) { - ASSERT_NEAR(naiveResult[i], simdResult[i], EPSILON); - } -} - -TEST(SIMDFunction, decayL1_WithLR) { - auto dest = NewRandomVector(); - auto src = NewRandomVector(); - auto lr = NewRandomVector(); - auto lambda = 0.23f; - - auto simd_dest = NewVector(); - memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN); - - typedef std::function - DecayL1MethodType; - - DecayL1MethodType naive = []( - float* d, float* s, float* lr, float l, size_t len) { - paddle::simd::naive::decayL1(d, s, lr, l, len); - }; - - DecayL1MethodType simd = []( - float* d, float* s, float* lr, float l, size_t len) { - paddle::simd::decayL1(d, s, lr, l, len); - }; - - naive(dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN); - simd(simd_dest.get(), src.get(), 
lr.get(), lambda, VECTOR_LEN); - - for (size_t i = 0; i < VECTOR_LEN; ++i) { - ASSERT_NEAR(dest[i], simd_dest[i], EPSILON); - } -} - -TEST(SIMDFunction, decayL1_WithoutLR) { - auto dest = NewRandomVector(); - auto src = NewRandomVector(); - auto lambda = 0.23; - - auto simd_dest = NewVector(); - memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN); - - typedef std::function DecayL1MethodType; - - DecayL1MethodType naive = [](float* d, float* s, float l, size_t len) { - paddle::simd::naive::decayL1(d, s, l, len); - }; - - DecayL1MethodType simd = [](float* d, float* s, float l, size_t len) { - paddle::simd::decayL1(d, s, l, len); - }; - - naive(dest.get(), src.get(), lambda, VECTOR_LEN); - simd(simd_dest.get(), src.get(), lambda, VECTOR_LEN); - - for (size_t i = 0; i < VECTOR_LEN; ++i) { - ASSERT_NEAR(dest[i], simd_dest[i], EPSILON); - } -} diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu deleted file mode 100644 index acb2da86d0f41d12fced97d1ddaf5be00959fb82..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_Tensor.cu +++ /dev/null @@ -1,1162 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "TensorCheck.h" -#include "paddle/math/Matrix.h" - -using paddle::Matrix; -using paddle::CpuMatrix; -using paddle::GpuMatrix; -using paddle::CpuVector; -using paddle::GpuVector; -using paddle::CpuIVector; -using paddle::GpuIVector; -using autotest::TensorCheckEqual; -using autotest::TensorCheckErr; - -#define INIT_UNARY(A1, A2) \ - Tensor A1(height, width); \ - Tensor A2(height, width); \ - A1.randomizeUniform(); \ - A2.copyFrom(A1) -#define INIT_BINARY(A1, A2, B) \ - INIT_UNARY(A1, A2); \ - Tensor B(height, width); \ - B.randomizeUniform() -#define INIT_TERNARY(A1, A2, B, C) \ - INIT_BINARY(A1, A2, B); \ - Tensor C(height, width); \ - C.randomizeUniform() -#define INIT_QUATERNARY(A1, A2, B, C, D) \ - INIT_TERNARY(A1, A2, B, C); \ - Tensor D(height, width); \ - D.randomizeUniform() - -template -struct TestUnaryMatrix { - typedef std::function UnaryFunc; - - explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) { - for (auto height : {1, 11, 73, 128, 200, 330}) { - for (auto width : {1, 32, 100, 512, 1000, 3210}) { - LOG(INFO) << " height=" << height << " width=" << width; - INIT_UNARY(A1, A2); - testUnaryFunc(A1, A2); - } - } - } -}; - -template -struct TestBinaryMatrix { - typedef std::function BinaryFunc; - - explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) { - for (auto height : {1, 11, 73, 128, 200, 330}) { - for (auto width : {1, 32, 100, 512, 1000, 3210}) { - LOG(INFO) << " height=" << height << " width=" << width; - INIT_BINARY(A1, A2, B); - testBinaryFunc(A1, A2, B); - } - } - } -}; - -template -struct TestTernaryMatrix { - typedef std::function - TernaryFunc; - - explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) { - for (auto height : {1, 11, 73, 128, 200, 330}) { - for (auto width : {1, 32, 100, 512, 1000, 3210}) { - LOG(INFO) << " height=" << height << 
" width=" << width; - INIT_TERNARY(A1, A2, B, C); - testTernaryFunc(A1, A2, B, C); - } - } - } -}; - -template -struct TestQuaternaryMatrix { - typedef std::function - QuaternaryFunc; - - explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) { - for (auto height : {1, 11, 73, 128, 200, 330}) { - for (auto width : {1, 32, 100, 512, 1000, 3210}) { - LOG(INFO) << " height=" << height << " width=" << width; - INIT_QUATERNARY(A1, A2, B, C, D); - testQuaternaryFunc(A1, A2, B, C, D); - } - } - } -}; - -template -struct TestUnaryVectorT { - typedef std::function UnaryFunc; - - explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) { - for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) { - LOG(INFO) << " size=" << size; - Tensor A1(size); - Tensor A2(size); - if (typeid(T) == typeid(real)) { - A1.rand(); - } else { - A1.rand(1000); - } - A2.copyFrom(A1); - testUnaryFunc(A1, A2); - } - } -}; - -void SetTensorValue(Matrix& matrix, real value) { - int height = matrix.getHeight(); - int width = matrix.getWidth(); - int stride = matrix.getStride(); - real* data = matrix.getData(); - for (int i = 0; i < height; i++) { - int j = rand() % width; // NOLINT - if (typeid(matrix) == typeid(CpuMatrix)) { - data[i * stride + j] = value; - } else if (typeid(matrix) == typeid(GpuMatrix)) { - hl_memcpy(&data[i * stride + j], &value, sizeof(real)); - } else { - } - } -} - -template -void testTensorAddScalar(Tensor& A1, Tensor& A2) { - real p1 = 2.5; - real p2 = 3.0; - A1.add(p1); // a += p - A2 += p1; - TensorCheckEqual(A1, A2); - - A1.add(p1, p2); // a = a * p1 + p2 - A2 = A2 * p1 + p2; - TensorCheckEqual(A1, A2); -} - -template -void testTensorSubScalar(Tensor& A1, Tensor& A2) { - real p = 2.5; - A1.subScalar(p); // a -= p - A2 -= p; - TensorCheckEqual(A1, A2); -} - -template -void testTensorMulScalar(Tensor& A1, Tensor& A2) { - real p = 2.5; - A1.mulScalar(p); // a *= p - A2 *= p; - TensorCheckEqual(A1, A2); - - real learningRate = 0.7f; - real decayRate = 1.2f; - A1.applyL2(learningRate, decayRate); - A2 = A2 * (1.0f / (1.0f + learningRate * decayRate)); - TensorCheckEqual(A1, A2); -} - -template -void testTensorDivScalar(Tensor& A1, Tensor& A2) { - real p = 2.5; - A1.divScalar(p); // a /= p - A2 /= p; - TensorCheckEqual(A1, A2); -} - -template -void testTensorNeg(Tensor& A1, Tensor& A2) { - A1.neg(); // a = -a - A2 = -A2; - TensorCheckEqual(A1, A2); -} - -template -void testTensorAbs(Tensor& A1, Tensor& A2) { - A1.abs2(); // a = a > 0 ? 
a : -a - A2 = A2.abs(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorSquare(Tensor& A1, Tensor& A2) { - A1.square2(); // a = a * a - A2 = A2.square(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorReciprocal(Tensor& A1, Tensor& A2) { - A1.reciprocal2(); // a = 1.0f / a - A2 = A2.reciprocal(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorSign(Tensor& A1, Tensor& A2) { - A1.sign2(); // a = (a > 0) - (a < 0) - A2 = A2.sign(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorAssign(Tensor& A1, Tensor& A2) { - A1.assign(1.5); // a = p - A2 = A2.constant(1.5); - TensorCheckEqual(A1, A2); - - A1.one(); // a = 1 - A2 = A2.constant(1.0); - TensorCheckEqual(A1, A2); - - A1.zero(); // a = 0 - A2 = A2.constant(0.0); - TensorCheckEqual(A1, A2); -} - -template -void testUnaryBaseOp(Tensor& A1, Tensor& A2) { - testTensorAddScalar(A1, A2); - testTensorSubScalar(A1, A2); - testTensorMulScalar(A1, A2); - testTensorDivScalar(A1, A2); - testTensorNeg(A1, A2); - testTensorAbs(A1, A2); - testTensorSquare(A1, A2); - testTensorReciprocal(A1, A2); - testTensorSign(A1, A2); - testTensorAssign(A1, A2); -} - -template -void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { - A1.add(2); // a += p - A2 += 2; - TensorCheckEqual(A1, A2); - - A1.add(3, 2); // a = a * p1 + p2 - A2 = A2 * 3 + 2; - TensorCheckEqual(A1, A2); - - testTensorNeg(A1, A2); - testTensorAbs(A1, A2); -} - -TEST(Unary, BaseOp) { - TestUnaryMatrix testCpuMatrix(testUnaryBaseOp); - TestUnaryVectorT testCpuVector(testUnaryBaseOp); - TestUnaryVectorT testCpuIVector( - testUnaryBaseOpInt); - -#ifdef PADDLE_WITH_GPU - TestUnaryMatrix testGpuMatrix(testUnaryBaseOp); - TestUnaryVectorT testGpuVector(testUnaryBaseOp); - TestUnaryVectorT testGpuIVector( - testUnaryBaseOpInt); -#endif -} - -template -void testTensorExp(Tensor& A1, Tensor& A2) { - A1.exp2(); // a = exp(a) - A2 = A2.exp(); - TensorCheckErr(A1, A2); -} - -template -void testTensorLog(Tensor& A1, Tensor& A2) { - A1.log2(); // a = log(a) - A2 = A2.log(); - TensorCheckErr(A1, A2); -} - -template -void testTensorSqrt(Tensor& A1, Tensor& A2) { - A1.sqrt2(); // a = sqrt(a) - A2 = A2.sqrt(); - TensorCheckErr(A1, A2); -} - -template -void testTensorPow(Tensor& A1, Tensor& A2) { - A1.pow2(3.2); // a = pow(a, p) - A2 = A2.pow(3.2); - TensorCheckErr(A1, A2); -} - -template -void testUnayrMathOp(Tensor& A1, Tensor& A2) { - testTensorExp(A1, A2); - testTensorLog(A1, A2); - testTensorSqrt(A1, A2); - testTensorPow(A1, A2); -} - -TEST(Unary, MathOp) { - TestUnaryMatrix testCpu(testUnayrMathOp); - -#ifdef PADDLE_WITH_GPU - TestUnaryMatrix testGpu(testUnayrMathOp); -#endif -} - -template -void testTensorClip(Tensor& A1, Tensor& A2) { - real p1 = 0.003f; - real p2 = 0.877f; - A1.clip(p1, p2); // a = a < p1 ? p1 : (a > p2 ? p2 : a) - // A2 = A2.min(0.877f).max(0.003f); - A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2)); - TensorCheckEqual(A1, A2); -} - -template -void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { - real p = 0.5f; - A1.biggerThanScalar(p); // a = a > p ? 1.0f : 0.0f - A2 = (A2 > p).condition((real)1.0, (real)0.0); - TensorCheckEqual(A1, A2); -} - -template -void testTensorapplyL1(Tensor& A1, Tensor& A2) { - /** - * T lambda = p; - * a = (a > lambda) ? (a - lambda) - * : (a < -lambda) ? 
(a + lambda) : 0 - * - * p = learningRate * decayRate; - */ - real learningRate = 0.7f; - real decayRate = 0.6f; - A1.applyL1(learningRate, decayRate); - A2 = (A2 > (learningRate * decayRate)) - .condition( - (A2 - (learningRate * decayRate)), - (A2 < -(learningRate * decayRate)) - .condition((A2 + (learningRate * decayRate)), (real)0.0)); - TensorCheckEqual(A1, A2); -} - -template -void testUnayrCompareOp(Tensor& A1, Tensor& A2) { - testTensorClip(A1, A2); - testTensorBiggerThanScalar(A1, A2); - - A1.randomizeUniform(); - A1.subScalar(0.5f); - A2.copyFrom(A1); - testTensorapplyL1(A1, A2); -} - -TEST(Unary, CompareOp) { - TestUnaryMatrix testCpu(testUnayrCompareOp); - -#ifdef PADDLE_WITH_GPU - TestUnaryMatrix testGpu(testUnayrCompareOp); -#endif -} - -template -void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { - real p1 = 2.5; - real p2 = 3.2; - A1.add(B); // a += b - A2 += B; - TensorCheckEqual(A1, A2); - - A1.add(B, p1); // a += b * p - A2 += B * p1; - TensorCheckEqual(A1, A2); - - A1.add(B, p1, p2); // a = p1 * a + p2 * b - A2 = A2 * p1 + B * p2; - TensorCheckEqual(A1, A2); - - A1.addScalar(B, p1); // a = b + p - A2 = B + p1; - TensorCheckEqual(A1, A2); - - A1.addSquare(B, p1); // a += p * b * b - A2 += B.constant(p1) * B * B; - TensorCheckEqual(A1, A2); - - A1.decayAddSquare(B, p1, p2); // a = p1 * a + p2 * b * b - A2 = A2 * p1 + B.constant(p2) * B * B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { - real p = 2.5; - A1.sub(B); // a -= b - A2 -= B; - TensorCheckEqual(A1, A2); - - A1.sub(B, p); // a -= b * p - A2 -= B * p; - TensorCheckEqual(A1, A2); - - A1.subScalar(B, p); // a = b - p - A2 = B - p; - TensorCheckEqual(A1, A2); -} - -template -void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { - real p = 2.5; - A1.mulScalar(B, p); // a = b * p - A2 = B * p; - TensorCheckEqual(A1, A2); - - A1.dotMulSquare(B); // a *= b * b - A2 *= B * B; - TensorCheckEqual(A1, A2); - - A1.dotSquareMul(B); // a = a * a * b - A2 = A2 * A2 * B; - TensorCheckEqual(A1, A2); - - A1.dotMul(B); // a *= b - A2 *= B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { - real p = 2.5; - A1.divScalar(B, p); // a = b / p - A2 = B / p; - TensorCheckEqual(A1, A2); - - A1.scalarDiv(B, p); // a = p / b - A2 = B.constant(p) / B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) { - A1.assign(B); // a = b - A2 = B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) { - B.square2(A1); // b = a * a - A2 = B.square(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.squareDerivative(B); // a *= 2.0 * b - A2 = A2 * (real)2.0 * B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { - B.reciprocal2(A1); // b = 1.0f / a - A2 = B.reciprocal(); - TensorCheckEqual(A1, A2); - - real p1 = 0.58; - real p2 = 0.32; - A1.reciprocal2(B, p1, p2); // a = 1 / (p1 * b + p2) - A2 = (B * p1 + p2).reciprocal(); - TensorCheckEqual(A1, A2); - - real learningRate = 0.7f; - real decayRate = 1.2f; - A1.applyL2(B, learningRate, decayRate); // a *= (1.0f / (1.0f + p * b)) - A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B) - .reciprocal(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.reciprocalDerivative(B); // a *= -b * b - A2 
*= (-B) * B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) { - B.sign2(A1); // b = a > 0.0f ? 1.0f : -1.0f - A2 = B.sign(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) { - B.abs2(A1); // b = a > 0.0f ? a : -a - A2 = B.abs(); - TensorCheckEqual(A1, A2); -} - -template -void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) { - testTensorAdd(A1, A2, B); - testTensorSub(A1, A2, B); - testTensorMul(A1, A2, B); - testTensorDiv(A1, A2, B); - testTensorSquare(A1, A2, B); - testTensorSquareDerivative(A1, A2, B); - testTensorReciprocal(A1, A2, B); - testTensorReciprocalDerivative(A1, A2, B); - testTensorAbs(A1, A2, B); - testTensorSign(A1, A2, B); - testTensorAssign(A1, A2, B); -} - -TEST(Binary, BaseOp) { - TestBinaryMatrix testCpu(testBinaryBaseOp); - -#ifdef PADDLE_WITH_GPU - TestBinaryMatrix testGpu(testBinaryBaseOp); -#endif -} - -template -void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { - // a = exp(b) - A1.exp2(B); - A2 = B.exp(); - TensorCheckErr(A1, A2); -} - -template -void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.expDerivative(B); // a *= b - A2 *= B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { - // a = log(b) - A1.log2(B); - A2 = B.log(); - TensorCheckErr(A1, A2); -} - -template -void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { - // a = sqrt(b) - A1.sqrt2(B); - A2 = B.sqrt(); - TensorCheckErr(A1, A2); -} - -template -void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { - // a = 1.0f / sqrt(b) - A1.invSqrt(B); - A2 = B.sqrt().reciprocal(); - TensorCheckErr(A1, A2); -} - -template -void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) { - A1.pow2(B, 2.5f); // a = pow(b, p) - A2 = B.pow(2.5f); - TensorCheckErr(A1, A2); -} - -template -void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { - /* - * const T THRESHOLD = 40.0; - * b = log(1.0 + - * exp((a > THRESHOLD) ? THRESHOLD - * : ((a < -THRESHOLD) ? (-THRESHOLD) : a))) - */ - B.softrelu(A1); - - real THRESHOLD = 40.0; - A2 = (B.constant(1.0f) + - (B > THRESHOLD) - .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)) - .exp()) - .log(); - TensorCheckErr(A1, A2); -} - -template -void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - /* - * const T THRESHOLD = 40.0; - * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) - * ? THRESHOLD - * : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); - */ - A1.softreluDerivative(B); - real THRESHOLD = 40.0; - A2 = A2 * - (B.constant(1.0f) - - (B.constant(-1.0f) * - (B > THRESHOLD) - .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))) - .exp()); - TensorCheckErr(A1, A2); -} - -template -void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { - /* - const T THRESHOLD_MIN = -40.0; - const T THRESHOLD_MAX = 13.0; - T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN - : ((a > THRESHOLD_MAX) ? 
THRESHOLD_MAX : a); - b = 1.0f / (1.0f + exp(-tmp))) - */ - B.sigmoid(A1); - - const real THRESHOLD_MIN = -40.0; - const real THRESHOLD_MAX = 13.0; - auto tmp = (B < THRESHOLD_MIN) - .condition(THRESHOLD_MIN, - (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); - A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal(); - TensorCheckErr(A1, A2); -} - -template -void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.sigmoidDerivative(B); // a *= b * (1 - b) - A2 *= B * (B.constant(1.0f) - B); - TensorCheckEqual(A1, A2); -} - -template -void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) { - B.tanh(A1); // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 - A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f; - TensorCheckErr(A1, A2); -} - -template -void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.tanhDerivative(B); // a *= 1 - b * b - A2 *= B.constant(1.0f) - B * B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) { - real p1 = 2.5; - real p2 = 3.1; - // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0) - B.scaledTanh(A1, p1, p2); - A2 = B.constant(p1) * - (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - - (real)1.0); - TensorCheckErr(A1, A2); -} - -template -void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - real p1 = 2.5; - real p2 = 3.1; - // a *= (p2 / p1) * (p1 * p1 - b * b)); - A1.scaledTanhDerivative(B, p1, p2); - A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B)); - TensorCheckEqual(A1, A2); -} - -template -void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) { - testTensorTanhDerivative(A1, A2, B); - testTensorScaledTanhDerivative(A1, A2, B); - testTensorSigmoidDerivative(A1, A2, B); - testTensorExpDerivative(A1, A2, B); - testTensorScaledTanh(A1, A2, B); - testTensorTanh(A1, A2, B); - testTensorExp(A1, A2, B); - testTensorLog(A1, A2, B); - testTensorSqrt(A1, A2, B); - testTensorInvSqrt(A1, A2, B); - testTensorPow(A1, A2, B); - - testTensorSoftrelu(A1, A2, B); - testTensorSoftreluDerivative(A1, A2, B); - testTensorSigmoid(A1, A2, B); -} - -TEST(Binary, MathOp) { - TestBinaryMatrix testCpu(testBinaryMathOp); - -#ifdef PADDLE_WITH_GPU - TestBinaryMatrix testGpu(testBinaryMathOp); -#endif -} - -template -void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) { - B.relu(A1); // b = a > 0.0f ? a : 0.0f - A2 = (B > (real)0.0f).condition(B, (real)0.0f); - TensorCheckEqual(A1, A2); -} - -template -void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.reluDerivative(B); // a *= (b > 0.0f ? 1.0f : 0.0f) - A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0); - TensorCheckEqual(A1, A2); -} - -template -void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { - /* - * b = a > p1 ? a : p1 - * b = b < p2 ? b : p2 - * int p1 = 0, p2 = 24; - */ - SetTensorValue(B, 32.0f); - B.brelu(A1); - auto tmp = (B > (real)0.0f).condition(B, (real)0.0f); - A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f); - TensorCheckEqual(A1, A2); -} - -template -void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - SetTensorValue(B, 32.0f); - /* - * a *= (b > p1 && b < p2) ? 1.0 : 0.0 - * int p1 = 0, p2 = 24; - */ - A1.breluDerivative(B); - A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f); - TensorCheckEqual(A1, A2); -} - -template -void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.absDerivative(B); // a = (b > 0) ? a : (b < 0) ? 
-a : 0 - A2 = (B > (real)0.0f) - .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f)); - TensorCheckEqual(A1, A2); -} - -template -void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { - real p = 0.613; - SetTensorValue(B, p); - A1.isEqualTo(B, p); // a = (b == p) - A2 = (B == p); - TensorCheckEqual(A1, A2); -} - -template -void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { - /** - * T lambda = p * b; - * a = (a > lambda) ? (a - lambda) - * : (a < -lambda) ? (a + lambda) : 0 - * - * p = learningRate * decayRate; - */ - real learningRate = 0.7f; - real decayRate = 0.6f; - A1.applyL1(B, learningRate, decayRate); - auto lambda = B.constant(learningRate * decayRate) * B; - A2 = (A2 > lambda) - .condition((A2 - lambda), - (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); - TensorCheckEqual(A1, A2); -} - -template -void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) { - B.subScalar(0.5f); - SetTensorValue(B, 0.0f); - testTensorReluDerivative(A1, A2, B); - - A1.randomizeUniform(); - A2.copyFrom(A1); - testTensorBreluDerivative(A1, A2, B); - - testTensorAbsDerivative(A1, A2, B); - testTensorRelu(A1, A2, B); - testTensorBrelu(A1, A2, B); - testTensorIsEqualTo(A1, A2, B); -} - -TEST(Binary, CompareOp) { - TestBinaryMatrix testCpu(testBinaryCompareOp); - -#ifdef PADDLE_WITH_GPU - TestBinaryMatrix testGpu(testBinaryCompareOp); -#endif -} - -template -void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.add(B, C); // a = b + c - A2 = B + C; - TensorCheckEqual(A1, A2); - - real p1 = 1.5; - real p2 = 2.5; - real p3 = 3.8; - A1.add(B, p1, C, p2); // a = p1 * b + p2 * c - A2 = B * p1 + C * p2; - TensorCheckEqual(A1, A2); - - A1.add2(B, C); // a = a + b + c - A2 = A2 + B + C; - TensorCheckEqual(A1, A2); - - A1.add2(B, C, p1, p2, p3); // a = p1 * a + p2 * b + p3 * c - A2 = A2 * p1 + B * p2 + C * p3; - TensorCheckEqual(A1, A2); - - A1.decayAddSquareMul(B, C, p1, p2); // a = p1 * a + p2 * b * b * c * c - A2 = A2 * p1 + B.constant(p2) * B * B * C * C; - TensorCheckEqual(A1, A2); -} - -template -void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.sub(B, C); // a = b - c - A2 = B - C; - TensorCheckEqual(A1, A2); - - real p1 = 1.5; - real p2 = 2.5; - A1.sub(B, p1, C, p2); // a = p1 * b - p2 * c - A2 = B * p1 - C * p2; - TensorCheckEqual(A1, A2); -} - -template -void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.dotMul(B, C); // a = b * c - A2 = B * C; - TensorCheckEqual(A1, A2); - - A1.dotMulSquare(B, C); // a = b * c * c - A2 = B * C * C; - TensorCheckEqual(A1, A2); - - A1.dotSquareSquare(B, C); // a = b * b * c * c - A2 = B * B * C * C; - TensorCheckEqual(A1, A2); - - real p1 = 1.5; - real p2 = 2.5; - - /* - * T tmp = p1 * b + p2 * c; - * a *= tmp * tmp - */ - A1.dotMulSquareSum(B, C, p1, p2); - auto tmp = B * p1 + C * p2; - A2 *= tmp * tmp; - TensorCheckEqual(A1, A2); - - /* - * T tmp = p1 * b + p2 * c; - * a = tmp * tmp - */ - A1.dotSquareSum(B, C, p1, p2); - auto tmp2 = B * p1 + C * p2; - A2 = tmp2 * tmp2; - TensorCheckEqual(A1, A2); - - // a *= p1 * b + p2 * c - A1.dotMulSum(B, C, p1, p2); - A2 *= B * p1 + C * p2; - TensorCheckEqual(A1, A2); - - // a = p1 * a + p2 * b * c - A1.addDotMul(B, C, p1, p2); - A2 = A2 * p1 + B.constant(p2) * B * C; - TensorCheckEqual(A1, A2); -} - -template -void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.dotDiv(B, C); // a = (b == 0.0) ? 
0.0 : b / c - A2 = (B == (real)0.0).condition((real)0.0, B / C); - TensorCheckEqual(A1, A2); - - real p1 = 1.5; - real p2 = 2.5; - A1.dotDiv(B, C, p1, p2); // a = (b + p1) / (c + p2) - A2 = (B + p1) / (C + p2); - TensorCheckEqual(A1, A2); -} - -template -void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - real p1 = 1.5; - real p2 = 2.5; - real p3 = 3.5; - A1.reciprocalSum(B, C, p1, p2, p3); // a = 1 / (p1 * b + p2 * c + p3) - A2 = (B * p1 + C * p2 + p3).reciprocal(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.softCrossEntropy(B, C); // a = -c * log(b) - (1 - c) * log(1 - b) - A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log(); - TensorCheckErr(A1, A2); -} - -template -void testTensorSoftCrossEntropyBp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C) { - A1.softCrossEntropyBp(B, C); // a += (b - c) / (b * (1 - b)) - A2 += (B - C) / (B * (B.constant(1.0f) - B)); - TensorCheckEqual(A1, A2); -} - -template -void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - testTensorAdd(A1, A2, B, C); - testTensorSub(A1, A2, B, C); - testTensorMul(A1, A2, B, C); - testTensorDiv(A1, A2, B, C); - testTensorReciprocal(A1, A2, B, C); - testTensorSoftCrossEntropyBp(A1, A2, B, C); - - testTensorSoftCrossEntropy(A1, A2, B, C); -} - -TEST(Ternary, BaseOp) { - TestTernaryMatrix testCpu(testTernaryBaseOp); - -#ifdef PADDLE_WITH_GPU - TestTernaryMatrix testGpu(testTernaryBaseOp); -#endif -} - -template -void testTensorBinaryLabelCrossEntropy(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C) { - A1.binaryLabelCrossEntropy(B, C); // a = c > 0.5 ? -log(b) : -log(1.0 - b) - A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log())); - TensorCheckErr(A1, A2); -} - -template -void testTensorBinaryLabelCrossEntropyBp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C) { - // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b) - A1.binaryLabelCrossEntropyBp(B, C); - A2 += (C > (real)0.5) - .condition((B.constant(-1.0f) / B), - (B.constant(1.0f) - B).reciprocal()); - TensorCheckErr(A1, A2); -} - -template -void testTensorLogisticRegressionLoss(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C) { - SetTensorValue(B, 50.0f); - SetTensorValue(B, -50.0f); - /** - * const T THRESHOLD = 40.0; - * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) - * ? -THRESHOLD - * : b; - * a = log(1 + exp(x)) - c * x - */ - A1.logisticRegressionLoss(B, C); - real THRESHOLD = 40.0; - auto tmp = - (B > THRESHOLD) - .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); - A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp; - TensorCheckErr(A1, A2); -} - -template -void testTensorLogisticRegressionLossBp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C) { - SetTensorValue(B, 50.0f); - SetTensorValue(B, -50.0f); - /** - * const T THRESHOLD = 40.0; - * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) - * ? -THRESHOLD - * : b; - * x = exp(x); a = x / (1 + x) - c - */ - A1.logisticRegressionLossBp(B, C); - real THRESHOLD = 40.0; - auto tmp = - (B > THRESHOLD) - .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); - auto tmp2 = tmp.exp(); - A2 = tmp2 / (C.constant(1.0) + tmp2) - C; - TensorCheckErr(A1, A2); -} - -template -void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.biggerThan(B, C); // a = (b > c) ? 
1.0f : 0.0f - A2 = (B > C).condition((real)1.0f, (real)0.0f); - TensorCheckEqual(A1, A2); -} - -template -void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.max2(B, C); // a = (b > c) ? b : c - A2 = (B > C).condition(B, C); - TensorCheckEqual(A1, A2); -} - -template -void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C); - testTensorBinaryLabelCrossEntropy(A1, A2, B, C); - testTensorBiggerThan(A1, A2, B, C); - testTensorMax(A1, A2, B, C); - - testTensorLogisticRegressionLoss(A1, A2, B, C); - testTensorLogisticRegressionLossBp(A1, A2, B, C); -} - -TEST(Ternary, CompareOp) { - TestTernaryMatrix testCpu(testTernaryCompareOp); - -#ifdef PADDLE_WITH_GPU - TestTernaryMatrix testGpu(testTernaryCompareOp); -#endif -} - -template -void testQuaternaryAdd( - Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { - // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f); // a = p1 * b + p2 * c + p3 * d - // A2 = B * 1.5f + C * 2.5f + D * 3.5f; - // TensorCheckEqual(A1, A2); - - /* - * T tmp = p1 * b + p2 * c + p3 * d; - * a += tmp * tmp - */ - real p1 = 1.5f; - real p2 = 2.5f; - real p3 = 3.5f; - A1.addSquareSum(B, C, D, p1, p2, p3); - auto tmp = B * p1 + C * p2 + D * p3; - A2 += tmp * tmp; - TensorCheckEqual(A1, A2); -} - -TEST(Quaternary, BaseOp) { - TestQuaternaryMatrix testCpu(testQuaternaryAdd); - -#ifdef PADDLE_WITH_GPU - TestQuaternaryMatrix testGpu(testQuaternaryAdd); -#endif -} - -template -void testTensorBiggerThan( - Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { - // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); - A1.biggerThan(B, C, D); - A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5)) - .condition((real)1.0, (real)0.0); - TensorCheckEqual(A1, A2); -} - -template -void testTensorRankLoss( - Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { - /** - * const T THRESHOLD = 40.0; a = b - c; - * a = (a > THRESHOLD) - * ? THRESHOLD - * : ((a < -THRESHOLD) ? (-THRESHOLD) : a); - * a = log(1 + exp(a)) - a * d - */ - A1.rankLoss(B, C, D); - - real THRESHOLD = 40.0; - auto tmp = B - C; - auto tmp2 = - (tmp > THRESHOLD) - .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); - A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D; - - TensorCheckErr(A1, A2); -} - -template -void testTensorRankLossBp( - Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { - /** - * const T THRESHOLD = 40.0; a = b - c; - * a = (a > THRESHOLD) - * ? THRESHOLD - * : ((a < -THRESHOLD) ? 
(-THRESHOLD) : a); - * a = exp(a); a = (a / (1 + a) - d) - */ - A1.rankLossBp(B, C, D); - real THRESHOLD = 40.0; - auto tmp = B - C; - auto tmp2 = - (tmp > THRESHOLD) - .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); - auto tmp3 = tmp2.exp(); - A2 = tmp3 / (D.constant(1.0f) + tmp3) - D; - - TensorCheckErr(A1, A2); -} - -template -void testQuaternaryCompareOp( - Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { - testTensorBiggerThan(A1, A2, B, C, D); - testTensorRankLoss(A1, A2, B, C, D); - testTensorRankLossBp(A1, A2, B, C, D); -} - -TEST(Quaternary, CompareOp) { - TestQuaternaryMatrix testCpu(testQuaternaryCompareOp); - -#ifdef PADDLE_WITH_GPU - TestQuaternaryMatrix testGpu(testQuaternaryCompareOp); -#endif -} diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp deleted file mode 100644 index fb58d26734cab5d7d7bbbbe1cf8a920e4195b4bb..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_TrainingAlgorithm.cpp +++ /dev/null @@ -1,461 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "OriginalOptimizerApi.h" -#include "PerfUtils.h" -#include "TensorCheck.h" -#include "paddle/math/TrainingAlgorithmOp.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT - -#ifndef PADDLE_TYPE_DOUBLE -DEFINE_double(max_diff, 1e-5, "max diff allowed"); -#else -DEFINE_double(max_diff, 1e-13, "max diff allowed"); -#endif - -class SetMaxDiff { - public: - explicit SetMaxDiff(double max_diff) { - max_diff_ = FLAGS_max_diff; - FLAGS_max_diff = max_diff; - } - ~SetMaxDiff() { FLAGS_max_diff = max_diff_; } - - private: - double max_diff_; -}; - -#define COPY_VECTOR_TO_CPU(cpuVec, vector) \ - do { \ - if (vector->useGpu()) { \ - cpuVec = Vector::create(vector->getSize(), false); \ - cpuVec->copyFrom(*vector); \ - } else { \ - cpuVec = vector; \ - } \ - } while (0) - -int VectorCheckErr(const Vector& vector1, const Vector& vector2) { - CHECK(vector1.getSize() == vector2.getSize()); - - const real* data1 = vector1.getData(); - const real* data2 = vector2.getData(); - size_t size = vector1.getSize(); - int count = 0; - for (size_t i = 0; i < size; i++) { - real a = data1[i]; - real b = data2[i]; - if (fabs(a - b) > FLAGS_max_diff) { - if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) { - count++; - } - } - } - - return count; -} - -int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) { - VectorPtr tmp1; - VectorPtr tmp2; - COPY_VECTOR_TO_CPU(tmp1, vector1); - COPY_VECTOR_TO_CPU(tmp2, vector2); - return VectorCheckErr(*tmp1, *tmp2); -} - -#ifdef PADDLE_DISABLE_TIMER - -#define CHECK_VECTORPTR(vector1, vector2) \ - EXPECT_EQ(VectorCheckErr(vector1, vector2), 0) - -#else - -#define CHECK_VECTORPTR(vector1, vector2) - -#endif - -typedef std::function testMatrixFunc; - -void testCase(testMatrixFunc matrixFunc) { -#ifdef PADDLE_WITH_CUDA - for (auto useGpu : {false, true}) { -#else - for (auto useGpu : {false}) { 
-#endif - for (auto size : {1, - 32, - 64, - 128, - 512, - 1024, - 4096, - 32768, - 65536, - 131072, - 262144, - 524288, - 1048576, - 2097152}) { - LOG(INFO) << " size=" << size << " useGpu=" << useGpu; - matrixFunc(size, useGpu); - } - } -} - -#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \ - vec1[type] = Vector::create(size, useGpu); \ - vec2[type] = Vector::create(size, useGpu); \ - vec1[type]->rand(); \ - vec2[type]->copyFrom(*vec1[type]); - -void testAdagrad(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); - - real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - real momentum = (real)rand() / (real)RAND_MAX; // NOLINT - real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT - - EXPRESSION_PERFORMANCE(AdagradParameterOptimizer( - bufs1, epsilon, learningRate, momentum, decayRate)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; - - EXPRESSION_PERFORMANCE(adagradApply(value, - grad, - mom, - accum_buffer, - accum, - lr, - epsilon, - learningRate, - momentum, - decayRate)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], - bufs2[PARAMETER_GRADIENT_SQURESUM1]); - CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], - bufs2[PARAMETER_LEARNING_RATE]); -} - -TEST(Training, Adagrad) { testCase(testAdagrad); } - -void testAdaDelta(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); - - real rou = (real)rand() / (real)RAND_MAX; // NOLINT - real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - real momentum = (real)rand() / (real)RAND_MAX; // NOLINT - real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT - - EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer( - bufs1, rou, epsilon, learningRate, momentum, decayRate)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; - - EXPRESSION_PERFORMANCE(adadeltaApply(value, - grad, - mom, - accum, - accum_update, - lr, - rou, - 
epsilon, - learningRate, - momentum, - decayRate)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], - bufs2[PARAMETER_GRADIENT_SQURESUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], - bufs2[PARAMETER_GRADIENT_SQURESUM1]); - CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], - bufs2[PARAMETER_LEARNING_RATE]); -} - -TEST(Training, AdaDelta) { testCase(testAdaDelta); } - -template -void testRMSProp(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); - - /* make sure 'g - f.square()' greater than 0 */ - bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0); - bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom( - *bufs1[PARAMETER_GRADIENT_SQURESUM]); - - real rou = (real)rand() / (real)RAND_MAX; // NOLINT - real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - real momentum = (real)rand() / (real)RAND_MAX; // NOLINT - real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT - real accumulatedRou = rou; - - EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1, - accumulatedRou, - rou, - epsilon, - learningRate, - momentum, - decayRate, - isFirstTime)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; - - EXPRESSION_PERFORMANCE(rmspropApply(value, - grad, - mom, - sum, - sum1, - lr, - accumulatedRou, - rou, - epsilon, - learningRate, - momentum, - decayRate, - isFirstTime)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], - bufs2[PARAMETER_GRADIENT_SQURESUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], - bufs2[PARAMETER_GRADIENT_SQURESUM1]); - CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], - bufs2[PARAMETER_LEARNING_RATE]); -} - -TEST(Training, RMSProp) { - testCase(testRMSProp); - testCase(testRMSProp); -} - -template -void testDecayedAdagrad(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); - - real rou = (real)rand() / (real)RAND_MAX; // NOLINT - real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - real momentum = (real)rand() / (real)RAND_MAX; // NOLINT - real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT - real accumulatedRou = rou; - - if (isFirstTime) { - 
bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem(); - bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem(); - } - - EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1, - accumulatedRou, - rou, - epsilon, - learningRate, - momentum, - decayRate, - isFirstTime)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; - - EXPRESSION_PERFORMANCE(decayedAdagradApply(value, - grad, - mom, - sum, - lr, - accumulatedRou, - rou, - epsilon, - learningRate, - momentum, - decayRate, - isFirstTime)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], - bufs2[PARAMETER_GRADIENT_SQURESUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], - bufs2[PARAMETER_LEARNING_RATE]); -} - -TEST(Training, DecayedAdagrad) { - testCase(testDecayedAdagrad); - testCase(testDecayedAdagrad); -} - -void testAdam(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu); - - real beta1 = (real)rand() / (real)RAND_MAX; // NOLINT - real beta2 = (real)rand() / (real)RAND_MAX; // NOLINT - real beta1_power = (real)rand() / (real)RAND_MAX; // NOLINT - real beta2_power = (real)rand() / (real)RAND_MAX; // NOLINT - real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - - EXPRESSION_PERFORMANCE(AdamParameterOptimizer( - bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM]; - - EXPRESSION_PERFORMANCE(adamApply(value, - grad, - mom, - v, - beta1, - beta2, - beta1_power, - beta2_power, - epsilon, - learningRate)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM], - bufs2[PARAMETER_SECOND_MOMENTUM]); -} - -TEST(Training, Adam) { testCase(testAdam); } - -void testAdamax(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu); - - real beta1 = (real)rand() / (real)RAND_MAX; // NOLINT - real beta2 = (real)rand() / (real)RAND_MAX; // NOLINT - real alpha = (real)rand() / (real)RAND_MAX; // NOLINT - int64_t step = 2; - - EXPRESSION_PERFORMANCE( - AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]; - - EXPRESSION_PERFORMANCE( - adamaxApply(value, grad, mom, u, beta1, 
beta2, step, alpha)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM], - bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]); -} - -TEST(Training, Adamax) { -#ifndef PADDLE_TYPE_DOUBLE - SetMaxDiff diff(1e-4); -#endif - testCase(testAdamax); -} - -void testSparseMomentum(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu); - - real alpha = (real)rand() / (real)RAND_MAX; // NOLINT - real beta = (real)rand() / (real)RAND_MAX; // NOLINT - real gamma = (real)rand() / (real)RAND_MAX; // NOLINT - real tau = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - - EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer( - bufs1, alpha, beta, gamma, tau, learningRate)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT]; - BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT]; - - EXPRESSION_PERFORMANCE(sparseMomentumApply( - value, grad, momU, momV, alpha, beta, gamma, tau, learningRate)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]); -} - -TEST(Training, SparseMomentum) { testCase(testSparseMomentum); } diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu deleted file mode 100644 index cbd74bbfe33270f351632b58d7e89f8e60d15b83..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_lazyAssign.cu +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "PerfUtils.h" -#include "TensorCheck.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/TensorAssign.h" - -using paddle::BaseMatrix; -using paddle::CpuMatrix; -using paddle::GpuMatrix; -using autotest::TensorCheckEqual; -using autotest::TensorCheckErr; - -typedef std::function testMatrixFunc; -void testMatrixCase(testMatrixFunc matrixFunc) { - for (auto height : {1}) { - for (auto width : {1, - 32, - 64, - 128, - 512, - 1024, - 4096, - 32768, - 65536, - 131072, - 262144, - 524288, - 1048576, - 2097152, - 4194304, - 8388608}) { - matrixFunc(height, width); - } - } -} - -template -void testLazyAssign(int height, int width) { - Tensor A1(height, width); - Tensor A2(height, width); - Tensor B(height, width); - Tensor C(height, width); - Tensor D(height, width); - A1.randomizeUniform(); - B.randomizeUniform(); - C.randomizeUniform(); - D.randomizeUniform(); - A2.copyFrom(A1); - - EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;); - - EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C); - auto expr2 = A2.lazyAssign(A2 * D); - AssignEvaluate(expr1, expr2);); - - TensorCheckErr(A1, A2); -} - -TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign); } - -#ifdef PADDLE_WITH_GPU -TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign); } -#endif - -template -void sgdUpdateTensor( - Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) { - C = C * p2 - D * (B + A * p3) * p1; - A += C; -} - -void sgdUpdateLazyAssign(BaseMatrix& A, - BaseMatrix& B, - BaseMatrix& C, - BaseMatrix& D, - real p1, - real p2, - real p3) { - auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1); - auto expr2 = A.lazyAssign(A + C); - AssignEvaluate(expr1, expr2); -} - -template -void testSgdUpdate(int height, int width) { - Tensor A1(height, width); - Tensor A2(height, width); - Tensor A3(height, width); - A1.randomizeUniform(); - A2.copyFrom(A1); - A3.copyFrom(A1); - - Tensor B(height, width); - B.randomizeUniform(); - - Tensor C1(height, width); - Tensor C2(height, width); - Tensor C3(height, width); - C1.randomizeUniform(); - C2.copyFrom(C1); - C3.copyFrom(C1); - - Tensor D(height, width); - D.randomizeUniform(); - - real p1 = 0.2; - real p2 = 0.3; - real p3 = 0.5; - - /** - * c = p2 * c - p1 * (b + p3 * a); - * a = a + c; - */ - // BaseMatrix API - EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3);); - - // Tensor expression - EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); - - // lazyAssign - EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); - - TensorCheckErr(A1, A2); - TensorCheckErr(A1, A3); - TensorCheckErr(C1, C2); - TensorCheckErr(C1, C3); -} - -TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate); } - -#ifdef PADDLE_WITH_GPU -TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate); } -#endif diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp deleted file mode 100644 index e45ddd433faf18dbcd647b305db3a36d38c90825..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_matrixCompare.cpp +++ /dev/null @@ -1,1698 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_CUDA -/// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when -/// only cpu version. - -#include -#include "TensorCheck.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/testing/TestUtil.h" -#include "paddle/utils/DynamicLoader.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT -using autotest::TensorCheckEqual; -using autotest::TensorCheckErr; - -void testMatrixMaxSequence(int batchSize, int inputDim) { - // forward - MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - - IVectorPtr cpuSequence; - generateSequenceStartPositions(batchSize, cpuSequence); - IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); - gpuSequence->copyFrom(*cpuSequence); - - int newBatchSize = cpuSequence->getSize() - 1; - MatrixPtr cpuOutput = std::make_shared(newBatchSize, inputDim); - MatrixPtr gpuOutput = std::make_shared(newBatchSize, inputDim); - cpuOutput->zero(); - gpuOutput->zero(); - - IVectorPtr cpuIndex = nullptr; - IVectorPtr gpuIndex = nullptr; - IVector::resizeOrCreate(cpuIndex, newBatchSize * inputDim, false); - IVector::resizeOrCreate(gpuIndex, newBatchSize * inputDim, true); - cpuIndex->zeroMem(); - gpuIndex->zeroMem(); - - cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex); - gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex); - - TensorCheckEqual(*cpuOutput, *gpuOutput); - TensorCheckEqual(*cpuIndex, *gpuIndex); - - // backward - MatrixPtr cpuOutputGrad = std::make_shared(newBatchSize, inputDim); - MatrixPtr gpuOutputGrad = std::make_shared(newBatchSize, inputDim); - cpuOutputGrad->randomizeUniform(); - gpuOutputGrad->copyFrom(*cpuOutputGrad); - - MatrixPtr cpuInputGrad = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInputGrad = std::make_shared(batchSize, inputDim); - cpuInputGrad->randomizeUniform(); - gpuInputGrad->copyFrom(*cpuInputGrad); - - cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex); - gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex); - - TensorCheckEqual(*cpuInputGrad, *gpuInputGrad); -} - -TEST(Matrix, maxSequence) { - for (auto batchSize : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 - for (auto inputDim : {1, 7, 131}) { // prime numbers close to 1, 8, 128 - VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; - testMatrixMaxSequence(batchSize, inputDim); - } - } -} - -void testMatrixGetSum(int height, int width) { - MatrixPtr cpuInput = std::make_shared(height, width); - MatrixPtr gpuInput = std::make_shared(height, width); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - -#ifndef PADDLE_TYPE_DOUBLE - int x = log10(height * width); - real err = 1e-6 * pow(10, x); -#else - real err = 1e-8; -#endif - - real cpuSum = cpuInput->getSum(); - real gpuSum = gpuInput->getSum(); - - EXPECT_LE(fabs(cpuSum - 
gpuSum), err); -} - -void testMatrixGetMinMax(int height, int width) { - MatrixPtr cpuInput = std::make_shared(height, width); - MatrixPtr gpuInput = std::make_shared(height, width); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - - real cpuMin = cpuInput->getMin(); - real gpuMin = gpuInput->getMin(); - real cpuMax = cpuInput->getMax(); - real gpuMax = gpuInput->getMax(); - - EXPECT_EQ(cpuMin, gpuMin); - EXPECT_EQ(cpuMax, gpuMax); -} - -void testMatrixZeroAtOffset(int height, int width) { - MatrixPtr cpuA = std::make_shared(height, width); - MatrixPtr gpuA = std::make_shared(height, width); - MatrixPtr cpuTest = std::make_shared(height, width); - - cpuA->randomizeUniform(); - gpuA->copyFrom(*cpuA); - cpuTest->copyFrom(*cpuA); - - int columnOffset = rand() % width; // NOLINT we just use rand() for test. - int numColumns = rand() % (width - columnOffset); // NOLINT - - if (numColumns == 0) return; - - cpuA->zeroAtOffset(columnOffset, numColumns); - gpuA->zeroAtOffset(columnOffset, numColumns); - - /* cpuTest */ - real* a = cpuTest->getData() + columnOffset; - for (int64_t i = 0; i < height; ++i) { - for (int64_t j = 0; j < numColumns; ++j) { - a[i * width + j] = 0; - } - } - - TensorCheckEqual(*cpuA, *gpuA); - TensorCheckEqual(*cpuA, *cpuTest); -} - -void testMatrixDeepSwap(int height, int width) { - MatrixPtr cpuA = std::make_shared(height, width); - MatrixPtr cpuB = std::make_shared(height, width); - MatrixPtr cpuCopyA = std::make_shared(height, width); - MatrixPtr cpuCopyB = std::make_shared(height, width); - - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - cpuCopyA->copyFrom(*cpuA); - cpuCopyB->copyFrom(*cpuB); - - // swap matrix cpuA and cpuB - cpuA->deepSwap(*cpuB); - - TensorCheckEqual(*cpuA, *cpuCopyB); - TensorCheckEqual(*cpuB, *cpuCopyA); -} - -void testMatrixTranspose(int height, int width) { - MatrixPtr cpu = std::make_shared(height, width); - MatrixPtr gpu = std::make_shared(height, width); - MatrixPtr cpuT = std::make_shared(width, height); - MatrixPtr gpuT = std::make_shared(width, height); - - cpu->randomizeUniform(); - gpu->copyFrom(*cpu); - cpu->transpose(cpuT, false); - gpu->transpose(gpuT, true); - - TensorCheckEqual(*cpuT, *gpuT); -} - -void testMatrixRotate(int height, int width) { - MatrixPtr cpu = std::make_shared(height, width); - MatrixPtr gpu = std::make_shared(height, width); - MatrixPtr cpuR = std::make_shared(width, height); - MatrixPtr gpuR = std::make_shared(width, height); - - cpu->randomizeUniform(); - gpu->copyFrom(*cpu); - - cpu->rotate(cpuR, false, true); - gpu->rotate(gpuR, true, true); - TensorCheckEqual(*cpuR, *gpuR); - - cpu->rotate(cpuR, true, false); - gpu->rotate(gpuR, false, false); - TensorCheckEqual(*cpuR, *gpuR); -} - -void testMatrixInverse(int height) { - MatrixPtr cpu = std::make_shared(height, height); - MatrixPtr gpu = std::make_shared(height, height); - MatrixPtr cpuI = std::make_shared(height, height); - MatrixPtr gpuI = std::make_shared(height, height); - - /* Make matrix well conditioned: cpu * cpuT + Identity */ - cpu->randomizeUniform(); - MatrixPtr cpuT = cpu->getTranspose(); - MatrixPtr outputCheck = std::make_shared(height, height); - outputCheck->mul(*cpu, *cpuT); - cpu->setDiag(1.0); - cpu->add(*outputCheck); - - gpu->copyFrom(*cpu); - cpu->inverse(cpuI, true); - gpu->inverse(gpuI, false); - - TensorCheckErr(*cpuI, *gpuI); - - outputCheck->mul(*cpu, *cpuI); - cpu->setDiag(1.0); - TensorCheckErr(*cpu, *outputCheck); -} - -TEST(Matrix, unary) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for 
(auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - VLOG(3) << " height=" << height << " width=" << width; - - testMatrixDeepSwap(height, width); - testMatrixZeroAtOffset(height, width); - testMatrixGetSum(height, width); - testMatrixTranspose(height, width); - testMatrixRotate(height, width); - } -#ifdef LAPACK_FOUND - // inverse matrix - testMatrixInverse(height); -#else - LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK" - << "support so we cannot test matrix inverse. To test " - << "matrix inverse, please install LAPACKE " - << "and MKL/Openblas, and re-build PaddlePaddle."; -#endif - } -} - -void testMatrixSoftmax(int height, int width) { - MatrixPtr cpuInput = std::make_shared(height, width); - MatrixPtr cpuOutput = std::make_shared(height, width); - MatrixPtr gpuInput = std::make_shared(height, width); - MatrixPtr gpuOutput = std::make_shared(height, width); - - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - cpuOutput->zero(); - gpuOutput->zero(); - cpuInput->softmax(*cpuOutput); - gpuInput->softmax(*gpuOutput); - - TensorCheckErr(*cpuOutput, *gpuOutput); -} - -void testSequenceSoftmax(int batchSize) { - // forward - int inputDim = 1; - MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - - IVectorPtr cpuSequence; - generateSequenceStartPositions(batchSize, cpuSequence); - IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); - gpuSequence->copyFrom(*cpuSequence); - - cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence); - gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence); - - TensorCheckErr(*cpuInput, *gpuInput); -} - -void testMatrixSoftmaxThreshold(int height, int width) { - MatrixPtr cpuInput = std::make_shared(height, width); - MatrixPtr cpuOutput = std::make_shared(height, width); - MatrixPtr gpuInput = std::make_shared(height, width); - MatrixPtr gpuOutput = std::make_shared(height, width); - - cpuInput->randomizeUniform(); - cpuInput->getData()[0] = 100.0; - gpuInput->copyFrom(*cpuInput); - cpuOutput->zero(); - gpuOutput->zero(); - cpuInput->softmax(*cpuOutput); - gpuInput->softmax(*gpuOutput); - - MatrixPtr outputCheck = std::make_shared(height, width); - outputCheck->copyFrom(*gpuOutput); - // check output zero - int cpuCount = 0; - int gpuCount = 0; - auto zeroNum = [](MatrixPtr out, int& count) { - for (size_t i = 0; i < out->getHeight(); i++) { - for (size_t j = 0; j < out->getWidth(); j++) { - if (out->getElement(i, j) == 0) count++; - } - } - }; - zeroNum(cpuOutput, cpuCount); - zeroNum(outputCheck, gpuCount); - EXPECT_EQ(cpuCount, 0) << "Cpu softmax output value 0"; - EXPECT_EQ(gpuCount, 0) << "Gpu softmax output value 0"; -} - -void testMatrixSoftmaxBp(int height, int width) { - MatrixPtr cpuInput = std::make_shared(height, width); - MatrixPtr cpuOutput = std::make_shared(height, width); - MatrixPtr gpuInput = std::make_shared(height, width); - MatrixPtr gpuOutput = std::make_shared(height, width); - - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - cpuOutput->randomizeUniform(); - gpuOutput->copyFrom(*cpuOutput); - gpuOutput->softmaxBackward(*gpuInput); - - MatrixPtr sftMaxSum = std::make_shared(height, 1); - MatrixPtr sftMaxDot = std::make_shared(height, width); - sftMaxDot->dotMul(*cpuOutput, *cpuInput); - sftMaxSum->colMerge(*sftMaxDot); - cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum); - - TensorCheckErr(*cpuOutput, *gpuOutput); -} - -TEST(Matrix, 
softmax) { - for (auto height : {1, 3, 131}) { // prime numbers close to 1, 4, 127 - for (auto width : {1, 17, 251}) { // prime numbers close to 1, 16, 256 - VLOG(3) << " height=" << height << " width=" << width; - - testMatrixSoftmax(height, width); - testMatrixSoftmaxBp(height, width); - testMatrixSoftmaxThreshold(height, width); - } - testSequenceSoftmax(height); - } -} - -void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) { - MatrixPtr cpuTable = std::make_shared(tableSize, inputDim); - MatrixPtr gpuTable = std::make_shared(tableSize, inputDim); - cpuTable->randomizeUniform(); - gpuTable->copyFrom(*cpuTable); - - IVectorPtr cpuIds; - IVectorPtr gpuIds; - cpuIds = VectorT::create(numSamples, false); - gpuIds = VectorT::create(numSamples, true); - cpuIds->rand(tableSize); - gpuIds->copyFrom(*cpuIds); - - MatrixPtr cpuOutput = std::make_shared(numSamples, inputDim); - MatrixPtr gpuOutput = std::make_shared(numSamples, inputDim); - cpuOutput->randomizeUniform(); - gpuOutput->copyFrom(*cpuOutput); - - cpuOutput->addToRows(*cpuTable, *cpuIds); - gpuOutput->addToRows(*gpuTable, *gpuIds); - - TensorCheckErr(*cpuTable, *gpuTable); -} - -TEST(Matrix, tableProjection) { - for (auto numSamples : {10, 100, 1000, 10000, 80000}) { - for (auto tableSize : {10, 100}) { - for (auto inputDim : {20, 50}) { - VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize - << " inputDim=" << inputDim; - testMatrixAddToRows(numSamples, tableSize, inputDim); - } - } - } -} - -void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { - int heightA = transa == false ? dimM : dimK; - int widthA = transa == false ? dimK : dimM; - int heightB = transb == false ? dimK : dimN; - int widthB = transb == false ? dimN : dimK; - int heightC = dimM; - int widthC = dimN; - - MatrixPtr cpuA = std::make_shared(heightA, widthA, transa); - MatrixPtr cpuB = std::make_shared(heightB, widthB, transb); - MatrixPtr cpuC = std::make_shared(heightC, widthC); - MatrixPtr gpuA = std::make_shared(heightA, widthA, transa); - MatrixPtr gpuB = std::make_shared(heightB, widthB, transb); - MatrixPtr gpuC = std::make_shared(heightC, widthC); - - real alpha = 1.5; - real beta = 2.0; - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - cpuC->randomizeUniform(); - gpuA->copyFrom(*cpuA); - gpuB->copyFrom(*cpuB); - gpuC->copyFrom(*cpuC); - - cpuC->mul(*cpuA, *cpuB, alpha, beta); - gpuC->mul(*gpuA, *gpuB, alpha, beta); - - TensorCheckErr(*cpuC, *gpuC); -} - -void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { - int heightA = transa == false ? dimM : dimK; - int widthA = transa == false ? dimK : dimM; - int heightB = transb == false ? dimK : dimN; - int widthB = transb == false ? 
dimN : dimK; - int heightC = dimM; - int widthC = dimN; - - MatrixPtr cpuA = std::make_shared(heightA, widthA, transa); - MatrixPtr cpuB = std::make_shared(heightB, widthB, transb); - MatrixPtr cpuC = std::make_shared(heightC, widthC); - MatrixPtr gpuA = std::make_shared(heightA, widthA, transa); - MatrixPtr gpuB = std::make_shared(heightB, widthB, transb); - MatrixPtr gpuC = std::make_shared(heightC, widthC); - - real alpha = 1.5; - real beta = 2.0; - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - cpuC->randomizeUniform(); - gpuA->copyFrom(*cpuA); - gpuB->copyFrom(*cpuB); - gpuC->copyFrom(*cpuC); - - auto subSize = [](int& start, int& end, int dim) { - if (dim == 1) { - start = 0; - end = dim; - } else { - int subDim = rand() % (dim - 1) + 1; // NOLINT - start = rand() % (dim - subDim); // NOLINT - end = start + subDim; - } - }; - - auto subMatrix = [](MatrixPtr& sub, - MatrixPtr matrix, - size_t startRow, - size_t endRow, - size_t startCol, - size_t endCol) { - if (!matrix->isTransposed()) { - sub = matrix->subMatrix(startRow, endRow, startCol, endCol); - } else { - sub = matrix->subMatrix(startCol, endCol, startRow, endRow); - } - }; - - int startM, endM; - int startN, endN; - int startK, endK; - subSize(startM, endM, dimM); - subSize(startN, endN, dimN); - subSize(startK, endK, dimK); - - MatrixPtr subCpuA; - MatrixPtr subCpuB; - MatrixPtr subGpuA; - MatrixPtr subGpuB; - subMatrix(subCpuA, cpuA, startM, endM, startK, endK); - subMatrix(subGpuA, gpuA, startM, endM, startK, endK); - subMatrix(subCpuB, cpuB, startK, endK, startN, endN); - subMatrix(subGpuB, gpuB, startK, endK, startN, endN); - MatrixPtr subCpuC = cpuC->subMatrix(startM, endM, startN, endN); - MatrixPtr subGpuC = gpuC->subMatrix(startM, endM, startN, endN); - - subCpuC->mul(*subCpuA, *subCpuB, alpha, beta); - subGpuC->mul(*subGpuA, *subGpuB, alpha, beta); - - TensorCheckErr(*cpuC, *gpuC); -} - -TEST(Matrix, mul) { - for (auto transa : {false, true}) { - for (auto transb : {false, true}) { - for (auto dimM : {1, 9, 53, 127, 345, 1023, 2135}) { - for (auto dimN : {1, 5, 37, 256, 1024}) { - for (auto dimK : {8, 45, 346, 784, 1025}) { - if (true == transa && true == transb) { - continue; - } - VLOG(3) << setiosflags(ios::left) << setfill(' ') - << " transa=" << transa << " transb=" << transb - << " dimM=" << setw(5) << dimM << " dimN=" << setw(5) - << dimN << " dimK=" << setw(5) << dimK; - - testMatrixMul(transa, transb, dimM, dimN, dimK); - testSubMatrixMul(transa, transb, dimM, dimN, dimK); - } - } - } - } - } -} - -void testVectorRowFunc(int size) { - CpuVectorPtr cpu = std::make_shared>(size); - GpuVectorPtr gpu = std::make_shared>(size); - - cpu->rand(); - gpu->copyFrom(*cpu); - - EXPECT_EQ(cpu->getMax(), gpu->getMax()); - EXPECT_EQ(cpu->getMin(), gpu->getMin()); - EXPECT_EQ(cpu->getAbsMax(), gpu->getAbsMax()); -} - -TEST(Vector, rowFunc) { - for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 - VLOG(3) << " size=" << size; - testVectorRowFunc(size); - } -} - -template -void testVectorReset(int size) { - std::shared_ptr> cpu = std::make_shared>(size); - std::shared_ptr> gpu = std::make_shared>(size); - - T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100)); - cpu->reset(value); - gpu->reset(value); - - TensorCheckEqual(*cpu, *gpu); -} - -template -void testVecortSelectFrom(int size) { - std::shared_ptr> cpuDst = std::make_shared>(size); - std::shared_ptr> gpuDst = std::make_shared>(size); - std::shared_ptr> cpuSrc = - std::make_shared>(size * 2); - std::shared_ptr> gpuSrc = - 
std::make_shared>(size * 2); - CpuIVectorPtr cpuIds = std::make_shared>(size); - GpuIVectorPtr gpuIds = std::make_shared>(size); - - if (std::is_same::value) { - cpuSrc->rand(); - } else { - cpuSrc->rand(100000); - } - gpuSrc->copyFrom(*cpuSrc); - cpuIds->rand(size); - gpuIds->copyFrom(*cpuIds); - - cpuDst->selectFrom(*cpuSrc, *cpuIds); - gpuDst->selectFrom(*gpuSrc, *gpuIds); - - TensorCheckEqual(*cpuDst, *gpuDst); -} - -template -void testVecotrZeroMem(int size) { - std::shared_ptr> cpu = std::make_shared>(size); - std::shared_ptr> gpu = std::make_shared>(size); - - cpu->zeroMem(); - gpu->zeroMem(); - - TensorCheckEqual(*cpu, *gpu); -} - -template -void testVectorIsEqual(int size) { - std::shared_ptr> cpuA = std::make_shared>(size); - std::shared_ptr> cpuB = std::make_shared>(size); - std::shared_ptr> gpuA = std::make_shared>(size); - std::shared_ptr> gpuB = std::make_shared>(size); - - if (std::is_same::value) { - cpuB->rand(); - } else { - cpuB->rand(100000); - } - gpuB->copyFrom(*cpuB); - - T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100)); - cpuA->isEqualTo(*cpuB, value); - gpuA->isEqualTo(*gpuB, value); - - TensorCheckEqual(*cpuA, *gpuA); -} - -TEST(Vector, Equal) { - for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 - VLOG(3) << " size=" << size; - testVectorReset(size); - testVectorReset(size); - testVecortSelectFrom(size); - testVecortSelectFrom(size); - testVecotrZeroMem(size); - testVecotrZeroMem(size); - testVectorIsEqual(size); - testVectorIsEqual(size); - } -} - -void testMatrixTopK(int samples, int dim, int beamSize) { - MatrixPtr cpuSrc = std::make_shared(samples, dim); - MatrixPtr gpuSrc = std::make_shared(samples, dim); - MatrixPtr cpuVal = std::make_shared(samples, beamSize); - MatrixPtr gpuVal = std::make_shared(samples, beamSize); - IVectorPtr cpuIds = std::make_shared(samples * beamSize); - IVectorPtr gpuIds = std::make_shared(samples * beamSize); - - cpuSrc->randomizeUniform(); - gpuSrc->copyFrom(*cpuSrc); - - cpuSrc->rowMax(*cpuIds, *cpuVal); - gpuSrc->rowMax(*gpuIds, *gpuVal); - - TensorCheckEqual(*cpuVal, *gpuVal); -} - -TEST(Matrix, topK) { - for (auto samples : {1, 17, 131}) { // prime numbers close to 1, 16, 127 - for (auto dim : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 - for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { - if (beamSize > dim) continue; - VLOG(3) << " samples=" << samples << " beamSize=" << beamSize - << " dim=" << dim; - testMatrixTopK(samples, dim, beamSize); - } - } - } -} - -void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { - int nnz = samples * dim * ratio; - if (nnz < 1) nnz = 1; // Because sparseRand in MathUtil.cpp requires this. 
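The top-k tests here check rowMax() -- per-row top-k selection -- for dense and sparse inputs against the GPU path. As a rough standalone sketch (not Paddle API; the function name and dense row-major layout are assumptions), the reference behavior amounts to a per-row partial sort:

#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

// Keep the beamSize largest values of each row, in descending order.
std::vector<float> rowTopK(const std::vector<float>& data,
                           int rows, int cols, int beamSize) {
  assert(beamSize <= cols);  // the deleted tests likewise skip beamSize > dim
  std::vector<float> out(static_cast<size_t>(rows) * beamSize);
  std::vector<float> row(cols);
  for (int i = 0; i < rows; ++i) {
    std::copy(data.begin() + static_cast<size_t>(i) * cols,
              data.begin() + static_cast<size_t>(i) * cols + cols,
              row.begin());
    // Partially sort so the beamSize largest values come first.
    std::partial_sort(row.begin(), row.begin() + beamSize, row.end(),
                      std::greater<float>());
    std::copy(row.begin(), row.begin() + beamSize,
              out.begin() + static_cast<size_t>(i) * beamSize);
  }
  return out;
}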
- MatrixPtr cpuSrc = std::make_shared(samples, dim, nnz); - MatrixPtr gpuSrc = std::make_shared(samples, dim, nnz); - MatrixPtr cpuVal = std::make_shared(samples, beamSize); - MatrixPtr gpuVal = std::make_shared(samples, beamSize); - IVectorPtr cpuIds = std::make_shared(samples * beamSize); - IVectorPtr gpuIds = std::make_shared(samples * beamSize); - - cpuSrc->randomizeUniform(); - gpuSrc->copyFrom(*cpuSrc); - cpuVal->zero(); - cpuIds->zero(); - gpuVal->zero(); - gpuIds->zero(); - - cpuSrc->rowMax(*cpuIds, *cpuVal); - gpuSrc->rowMax(*gpuIds, *gpuVal); - - TensorCheckEqual(*cpuVal, *gpuVal); - - IVectorPtr outCheckIds = std::make_shared(samples * beamSize); - outCheckIds->copyFrom(*gpuIds); - - const int* data1 = cpuIds->getData(); - const int* data2 = outCheckIds->getData(); - size_t size = cpuIds->getSize(); - for (size_t i = 0; i < size; i++) { - if (data1[i] == -1 && data1[i] != data2[i]) { - EXPECT_EQ(data1[i], data2[i]); - } - } -} - -TEST(SMatrix, topK) { - for (auto samples : {1, 3, 61}) { - for (auto dim : {1, 3, 61}) { - for (auto beamSize : {1, 3, 61}) { - for (auto ratio : {0.01, 0.001}) { - if (beamSize > dim) continue; - VLOG(3) << " samples=" << samples << " beamSize=" << beamSize - << " dim=" << dim << " ratio=" << ratio; - testSMatrixTopK(samples, dim, beamSize, ratio); - } - } - } - } -} - -void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) { - MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - - IVectorPtr cpuSequence; - generateSequenceStartPositions(batchSize, cpuSequence); - IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); - gpuSequence->copyFrom(*cpuSequence); - - int newBatchSize = cpuSequence->getSize() - 1; - MatrixPtr cpuOutput = std::make_shared(newBatchSize, inputDim); - MatrixPtr gpuOutput = std::make_shared(newBatchSize, inputDim); - cpuOutput->zero(); - gpuOutput->zero(); - - cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode); - gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode); - - TensorCheckErr(*cpuOutput, *gpuOutput); - - MatrixPtr cpuInGrad = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInGrad = std::make_shared(batchSize, inputDim); - cpuInGrad->randomizeUniform(); - gpuInGrad->copyFrom(*cpuInGrad); - - cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode); - gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode); - - TensorCheckErr(*cpuInGrad, *gpuInGrad); -} - -TEST(Matrix, sequenceAvg) { - for (auto batchSize : {10, 128, 6000}) { - for (auto inputDim : {32, 100, 512}) { - for (auto mode : {0, 1, 2}) { - VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim - << " mode=" << mode; - testMatrixSequenceAvg(batchSize, inputDim, mode); - } - } - } -} - -void testParamReluBackwardDiff(int height, - int width, - int w_height, - int w_width) { - MatrixPtr oGrad = CpuMatrix::create(height, width, false, false); - MatrixPtr input = CpuMatrix::create(height, width, false, false); - MatrixPtr diff = CpuMatrix::create(height, width, false, false); - MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false); - - oGrad->randomizeUniform(); - input->randomizeUniform(); - w->randomizeUniform(); - diff->randomizeUniform(); - input->add(-0.5); - - MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true); - MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true); - MatrixPtr diffGpu = CpuMatrix::create(height, 
width, false, true); - MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true); - - oGradGpu->copyFrom(*oGrad); - inputGpu->copyFrom(*input); - wGpu->copyFrom(*w); - diffGpu->copyFrom(*diff); - - diff->paramReluBackwardDiff(*oGrad, *input, *w); - diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu); - - TensorCheckErr(*diff, *diffGpu); -} - -TEST(Matrix, paramReluBackwardDiff) { - for (auto height : {10, 40, 100}) { - for (auto width : {10, 40, 100}) { - for (auto w_height : {1, 2}) { - for (auto w_width : {1, 2}) { - if (width % (w_height * w_width)) continue; - testParamReluBackwardDiff(height, width, w_height, w_width); - } - } - } - } -} - -void testClassificationError(int numSamples, int dim, int topkSize) { - MatrixPtr cpuError = std::make_shared(numSamples, 1); - MatrixPtr gpuError = std::make_shared(numSamples, 1); - MatrixPtr cpuOutput = std::make_shared(numSamples, dim); - MatrixPtr gpuOutput = std::make_shared(numSamples, dim); - IVectorPtr cpuLabel = std::make_shared(numSamples); - IVectorPtr gpuLabel = std::make_shared(numSamples); - - cpuOutput->randomizeUniform(); - cpuLabel->rand(dim); - gpuOutput->copyFrom(*cpuOutput); - gpuLabel->copyFrom(*cpuLabel); - - cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize); - gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize); - - TensorCheckEqual(*cpuError, *gpuError); -} - -TEST(Matrix, classificationError) { - for (auto numSamples : {1, 3, 31}) { - for (auto dim : {1, 3, 31}) { - for (auto topkSize : {1, 3, (int)rand() % dim + 1}) { - if (topkSize > dim) continue; - VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize - << " dim= " << dim; - testClassificationError(numSamples, dim, topkSize); - } - } - } -} - -void testMaxPoolFwdBwd(int numSamples, - int channels, - int imgSizeH, - int imgSizeW, - int ksizeH, - int ksizeW, - int strideH, - int strideW, - int padH, - int padW) { - int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); - int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); - - int inWidth = imgSizeH * imgSizeW * channels; - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - int outWidth = channels * outH * outW; - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - - input->randomizeUniform(); - target->randomizeUniform(); - inputGpu->copyFrom(*input); - targetGpu->copyFrom(*target); - - target->maxPoolForward(*input, - imgSizeH, - imgSizeW, - channels, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - padH, - padW); - targetGpu->maxPoolForward(*inputGpu, - imgSizeH, - imgSizeW, - channels, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - padH, - padW); - MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); - targetCheck->copyFrom(*targetGpu); - checkMatrixEqual(target, targetCheck); - - MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->maxPoolBackward(*input, - imgSizeH, - imgSizeW, 
- *targetGrad, - *target, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - 1.0, - 1.0, - padH, - padW); - inputGpuGrad->maxPoolBackward(*inputGpu, - imgSizeH, - imgSizeW, - *targetGpuGrad, - *targetGpu, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - 1.0, - 1.0, - padH, - padW); - MatrixPtr targetBwdCheck = - CpuMatrix::create(numSamples, inWidth, false, false); - targetBwdCheck->copyFrom(*inputGpuGrad); - checkMatrixEqual(inputGrad, targetBwdCheck); -} - -void testAvgPoolFwdBwd(int numSamples, - int channels, - int imgSizeH, - int imgSizeW, - int ksizeH, - int ksizeW, - int strideH, - int strideW, - int padH, - int padW) { - int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); - int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); - - int inWidth = imgSizeH * imgSizeW * channels; - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - int outWidth = channels * outH * outW; - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - - input->randomizeUniform(); - target->randomizeUniform(); - inputGpu->copyFrom(*input); - targetGpu->copyFrom(*target); - - target->avgPoolForward(*input, - imgSizeH, - imgSizeW, - channels, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - padH, - padW); - targetGpu->avgPoolForward(*inputGpu, - imgSizeH, - imgSizeW, - channels, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - padH, - padW); - - TensorCheckErr(*target, *targetGpu); - - MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->avgPoolBackward(*targetGrad, - imgSizeH, - imgSizeW, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - 1.0, - 1.0, - padH, - padW); - inputGpuGrad->avgPoolBackward(*targetGpuGrad, - imgSizeH, - imgSizeW, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - 1.0, - 1.0, - padH, - padW); - - TensorCheckErr(*inputGrad, *inputGpuGrad); -} - -// TODO(yi): I noticed many such blindly combinatorial tests in this -// file. They are no help to locate defects at all. 
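The pooling tests above and below derive outH/outW via outputSize(imgSize, ksize, pad, stride, flag). A minimal sketch of that arithmetic, assuming the conventional ceil/floor distinction; which flag value selected which mode in the legacy helper is an assumption here, not taken from the diff:

#include <cassert>

// Output length of one pooled dimension. ceilMode keeps a final partial
// window; floor mode drops it. (Hypothetical helper, for illustration only.)
int pooledSize(int inSize, int kSize, int pad, int stride, bool ceilMode) {
  assert(stride > 0 && kSize <= inSize + 2 * pad);
  int span = inSize + 2 * pad - kSize;
  return (ceilMode ? (span + stride - 1) / stride : span / stride) + 1;
}
// e.g. pooledSize(14, 3, 0, 2, true) == 7, while floor mode gives 6.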
-TEST(Matrix, PoolFwdBwd) { - for (auto numSamples : {1, 3}) { - for (auto channels : {1, 3}) { - for (auto imgSizeH : {13, 17}) { - for (auto imgSizeW : {17, 19}) { - for (auto sizeX : {2, 3}) { - for (auto sizeY : {2, 3}) { - for (auto sH : {1, 2}) { - for (auto sW : {1, 2}) { - for (auto pH : {0, (sizeY - 1) / 2}) { - for (auto pW : {0, (sizeX - 1) / 2}) { - VLOG(3) << " numSamples=" << numSamples - << " channels=" << channels - << " imgSizeH=" << imgSizeH - << " imgSizeW=" << imgSizeW << " sizeX=" << sizeX - << " sizeY=" << sizeY << " strideH=" << sH - << " strideW=" << sW << " padingH=" << pH - << " padingW=" << pW; - testMaxPoolFwdBwd(numSamples, - channels, - imgSizeH, - imgSizeW, - sizeX, - sizeY, - sH, - sW, - pH, - pW); - testAvgPoolFwdBwd(numSamples, - channels, - imgSizeH, - imgSizeW, - sizeX, - sizeY, - sH, - sW, - pH, - pW); - } - } - } - } - } - } - } - } - } - } -} - -void testMaxOutFwdBwd( - int numSamples, int imgSizeH, int imgSizeW, int channels, int groups) { - int inWidth = imgSizeH * imgSizeW * channels; - int outChannels = channels / groups; - int outWidth = imgSizeH * imgSizeW * outChannels; - - // forward - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - - IVectorPtr id = CpuIVector::create(numSamples * outWidth, false); - IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true); - - input->randomizeUniform(); - inputGpu->copyFrom(*input); - - target->maxoutForward(*input, *id, outChannels, groups); - targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups); - - TensorCheckErr(*target, *targetGpu); - TensorCheckEqual(*id, *idGpu); - - // backward - MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups); - inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups); - - TensorCheckErr(*inputGrad, *inputGpuGrad); -} - -TEST(Matrix, MaxOutFwdBwd) { - for (auto numSamples : {5, 10}) { - for (auto channels : {8, 16}) { - for (auto imgSizeH : {14, 28}) { - for (auto imgSizeW : {16, 30}) { - for (auto groups : {2, 4}) { - VLOG(3) << " numSamples=" << numSamples << " channels=" << channels - << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW - << " groups=" << groups; - testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups); - } - } - } - } - } -} - -TEST(CpuMatrix, copyFrom) { - const size_t height = 31; - const size_t width = 53; - CpuMatrix cpu(height, width); - GpuMatrix gpu(height, width); - CpuMatrix copy(height, width); - - cpu.randomizeUniform(); - gpu.copyFrom(cpu); - copy.copyFrom(gpu, HPPL_STREAM_DEFAULT); - - TensorCheckEqual(cpu, copy); -} - -void testBatch2seqPadding(int batchSize, int inputDim) { - MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - 
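For context on `testMaxOutFwdBwd`: maxout reduces each group of input channels to one output channel by an element-wise max and records the winning channel so the backward pass can route gradients. A standalone sketch, assuming CHW layout and contiguous channels per group:

#include <vector>

// `out` and `ids` must each hold (channels / groups) * hw elements.
void maxoutForward(const std::vector<float>& in, std::vector<float>& out,
                   std::vector<int>& ids, int channels, int groups, int hw) {
  int outChannels = channels / groups;
  for (int oc = 0; oc < outChannels; ++oc) {
    for (int p = 0; p < hw; ++p) {
      int best = oc * groups;  // first channel of this group
      for (int g = 1; g < groups; ++g) {
        int c = oc * groups + g;
        if (in[c * hw + p] > in[best * hw + p]) best = c;
      }
      out[oc * hw + p] = in[best * hw + p];
      ids[oc * hw + p] = best;  // consumed by maxoutBackward
    }
  }
}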
- IVectorPtr cpuSequence; - generateSequenceStartPositions(batchSize, cpuSequence); - for (int i = 0; i < int(cpuSequence->getSize()); ++i) { - (cpuSequence->getData())[i] += 1; // so no way that maxSeqLen is 0; - } - - IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); - gpuSequence->copyFrom(*cpuSequence); - - size_t numSeq = cpuSequence->getSize() - 1; - size_t maxSeqLen = *std::max_element(cpuSequence->getData(), - cpuSequence->getData() + numSeq); - - printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen); - MatrixPtr cBatch = std::make_shared(numSeq * maxSeqLen, inputDim); - MatrixPtr gBatch = std::make_shared(numSeq * maxSeqLen, inputDim); - MatrixPtr cCheck = std::make_shared(numSeq * maxSeqLen, inputDim); - - // hl_sequence2batch_copy_padding(gBatch->getData(), - // gpuInput->getData(), - // cpuSequence->getData(), - // inputDim, - // maxSeqLen, - // numSeq, - // false, - // true); - // cCheck->copyFrom(*gBatch); - - // int* seqStart = cpuSequence->getData(); - // float* batchData = cBatch->getData(); - // float* seqData = cpuInput->getData(); - // for (size_t i = 0; i < maxSeqLen; i++) { - // for (size_t j = 0; j < numSeq; j++) { - // size_t sequenceStart = seqStart[j]; - // size_t sequenceLength = seqStart[j + 1] - seqStart[j]; - // if (i < sequenceLength) { - // memcpy(batchData + (i * numSeq + j) * inputDim, - // seqData + (sequenceStart + i) * inputDim, - // inputDim * sizeof(real)); - // } else { - // memset(batchData + (i * numSeq + j) * inputDim, - // 0, - // inputDim * sizeof(real)); - // } - // } - // } - - // TensorCheckErr(*cBatch, *cCheck); -} - -TEST(Matrix, warpCTC) { - for (auto batchSize : {1, 3, 17}) { - for (auto inputDim : {1, 3, 31}) { - VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; - testBatch2seqPadding(batchSize, inputDim); - } - } -} - -void testMaxPool3DFwdBwd(int numSamples, - int channels, - int imgSizeD, - int imgSizeH, - int imgSizeW, - int ksizeD, - int ksizeH, - int ksizeW, - int strideD, - int strideH, - int strideW, - int padD, - int padH, - int padW) { - int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true); - int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); - int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); - - int inWidth = channels * imgSizeD * imgSizeH * imgSizeW; - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - int outWidth = channels * outD * outH * outW; - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true); - - input->randomizeUniform(); - target->randomizeUniform(); - inputGpu->copyFrom(*input); - targetGpu->copyFrom(*target); - - target->maxPool3DForward(*input, - *maxIdx, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW); - targetGpu->maxPool3DForward(*inputGpu, - *maxIdxGpu, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW); - MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); - targetCheck->copyFrom(*targetGpu); - checkMatrixEqual(target, targetCheck); - - 
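The commented-out reference loop in `testBatch2seqPadding` above is worth preserving in runnable form, since it documents the layout `hl_sequence2batch_copy_padding` was checked against: sequences interleaved time-major, with positions past a sequence's end zero-filled. A self-contained version of that same loop:

#include <cstring>
#include <vector>

// Lay out variable-length sequences as a (maxSeqLen x numSeq) batch;
// `seqStart` has numSeq + 1 entries, and batchData must hold
// maxSeqLen * numSeq * inputDim floats.
void seq2BatchPadding(const float* seqData, float* batchData,
                      const std::vector<int>& seqStart, int inputDim,
                      int maxSeqLen) {
  int numSeq = static_cast<int>(seqStart.size()) - 1;
  for (int i = 0; i < maxSeqLen; ++i) {
    for (int j = 0; j < numSeq; ++j) {
      int start = seqStart[j];
      int length = seqStart[j + 1] - start;
      float* dst = batchData + (i * numSeq + j) * inputDim;
      if (i < length) {
        std::memcpy(dst, seqData + (start + i) * inputDim,
                    inputDim * sizeof(float));
      } else {
        std::memset(dst, 0, inputDim * sizeof(float));
      }
    }
  }
}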
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->maxPool3DBackward(*targetGrad, - *maxIdx, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW, - 1.0, - 1.0); - inputGpuGrad->maxPool3DBackward(*targetGpuGrad, - *maxIdxGpu, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW, - 1.0, - 1.0); - MatrixPtr targetBwdCheck = - CpuMatrix::create(numSamples, inWidth, false, false); - targetBwdCheck->copyFrom(*inputGpuGrad); - checkMatrixEqual(inputGrad, targetBwdCheck); -} - -void testAvgPool3DFwdBwd(int numSamples, - int channels, - int imgSizeD, - int imgSizeH, - int imgSizeW, - int ksizeD, - int ksizeH, - int ksizeW, - int strideD, - int strideH, - int strideW, - int padD, - int padH, - int padW) { - int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true); - int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); - int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); - - int inWidth = imgSizeD * imgSizeH * imgSizeW * channels; - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - int outWidth = channels * outD * outH * outW; - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - - input->randomizeUniform(); - target->randomizeUniform(); - inputGpu->copyFrom(*input); - targetGpu->copyFrom(*target); - - target->avgPool3DForward(*input, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW); - - targetGpu->avgPool3DForward(*inputGpu, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW); - - TensorCheckErr(*target, *targetGpu); - - MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->avgPool3DBackward(*targetGrad, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW, - 1.0, - 1.0); - - inputGpuGrad->avgPool3DBackward(*targetGpuGrad, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW, - 1.0, - 1.0); - TensorCheckErr(*inputGrad, *inputGpuGrad); -} - -// TODO(yi): I noticed many such blindly combinatorial tests in this -// file. 
They are no help to locate defects at all. -TEST(Matrix, Pool3DFwdBwd) { - for (auto numSamples : {1, 3}) { - for (auto channels : {3}) { - for (auto imgSizeD : {9, 16}) { - for (auto imgSizeH : {9, 32}) { - for (auto imgSizeW : {9, 32}) { - for (auto sizeX : {3}) { - for (auto sizeY : {3}) { - for (auto sizeZ : {3}) { - for (auto sD : {2}) { - for (auto sH : {2}) { - for (auto sW : {2}) { - for (auto pD : {0, (sizeZ - 1) / 2}) { - for (auto pH : {0, (sizeY - 1) / 2}) { - for (auto pW : {0, (sizeX - 1) / 2}) { - VLOG(3) << " numSamples=" << numSamples - << " channels=" << channels - << " imgSizeD=" << imgSizeD - << " imgSizeH=" << imgSizeH - << " imgSizeW=" << imgSizeW - << " sizeX=" << sizeX - << " sizeY=" << sizeY - << " sizeZ=" << sizeZ << " strideD=" << sD - << " strideH=" << sH << " strideW=" << sW - << " padingD=" << pD << " padingH=" << pH - << " padingW=" << pW; - - testMaxPool3DFwdBwd(numSamples, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - sizeX, - sizeY, - sizeZ, - sD, - sH, - sW, - pD, - pH, - pW); - testAvgPool3DFwdBwd(numSamples, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - sizeX, - sizeY, - sizeZ, - sD, - sH, - sW, - pD, - pH, - pW); - } - } - } - } - } - } - } - } - } - } - } - } - } - } - - // for (auto numSamples : {1, 3}) { - // for (auto channels : {1, 3}) { - // for (auto imgSizeD : {9,16}) { - // for (auto imgSizeH : {9, 32}) { - // for (auto imgSizeW : {9, 32}) { - // for (auto sizeX : {2, 3}) { - // for (auto sizeY : {2, 3}) { - // for (auto sizeZ : {2,3}){ - // for (auto sD : {1, 2}) { - // for (auto sH : {1, 2}) { - // for (auto sW : {1, 2}) { - // for (auto pD : {0, (sizeZ - 1) / 2}){ - // for (auto pH : {0, (sizeY - 1) / 2}) { - // for (auto pW : {0, (sizeX - 1) / 2}) { - // VLOG(3) << " numSamples=" << numSamples - // << " channels=" << channels - // << " imgSizeD=" << imgSizeD - // << " imgSizeH=" << imgSizeH - // << " imgSizeW=" << imgSizeW - // << " sizeX=" << sizeX - // << " sizeY=" << sizeY - // << " sizeZ=" << sizeZ - // << " strideD=" << sD - // << " strideH=" << sH - // << " strideW=" << sW - // << " padingD=" << pD - // << " padingH=" << pH - // << " padingW=" << pW; - // - // testMaxPool3DFwdBwd(numSamples, - // channels, - // imgSizeD, - // imgSizeH, - // imgSizeW, - // sizeX, - // sizeY, - // sizeZ, - // sD, - // sH, - // sW, - // pD, - // pH, - // pW); - // testAvgPool3DFwdBwd(numSamples, - // channels, - // imgSizeD, - // imgSizeH, - // imgSizeW, - // sizeX, - // sizeY, - // sizeZ, - // sD, - // sH, - // sW, - // pD, - // pH, - // pW); - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } -} - -void testMatrixCol2Vol(int depth, int height, int width) { - int channel = 3; - int filterX = 3, filterY = 4, filterZ = 5; - int strideX = 2, strideY = 2, strideZ = 2; - int padX = 1, padY = 1, padZ = 1; - - MatrixPtr cpuImage = - std::make_shared(channel, depth * height * width); - MatrixPtr gpuImage = - std::make_shared(channel, depth * height * width); - cpuImage->randomizeUniform(); - gpuImage->copyFrom(*cpuImage); - - int outD = outputSize(depth, filterZ, padZ, strideZ, true); - int outH = outputSize(height, filterY, padY, strideY, true); - int outW = outputSize(width, filterX, padX, strideX, true); - - int colBufHeight = channel * filterZ * filterY * filterX; - int colBufWidth = outD * outH * outW; - MatrixPtr cpuColBuf = std::make_shared(colBufHeight, colBufWidth); - MatrixPtr gpuColBuf = std::make_shared(colBufHeight, colBufWidth); - cpuColBuf->vol2Col(cpuImage->getData(), - channel, - depth, - 
height, - width, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX); - gpuColBuf->vol2Col(gpuImage->getData(), - channel, - depth, - height, - width, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX); - TensorCheckEqual(*cpuColBuf, *gpuColBuf); - - cpuColBuf->randomizeUniform(); - gpuColBuf->copyFrom(*cpuColBuf); - cpuColBuf->col2Vol(cpuImage->getData(), - channel, - depth, - height, - width, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX, - 1.0, - 1.0); - gpuColBuf->col2Vol(gpuImage->getData(), - channel, - depth, - height, - width, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX, - 1.0, - 1.0); - TensorCheckErr(*cpuImage, *gpuImage); -} - -TEST(Matrix, col2Vol) { - for (auto depth : {9, 16, 64}) { - for (auto height : {9, 11, 128}) { - for (auto width : {9, 32, 128}) { - VLOG(3) << "depth=" << depth << " height=" << height - << " width=" << width; - testMatrixCol2Vol(depth, height, width); - } - } - } -} - -#endif diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h deleted file mode 100644 index 86297547dcd83ca87d1c87a8489f7af2f3e9f492..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_matrixUtil.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
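`testMatrixCol2Vol` above exercises the vol2Col/col2Vol pair: a gather that unrolls every 3-D receptive field into one column of a (C*kD*kH*kW) x (outD*outH*outW) buffer, and its scatter-add inverse. An illustrative sketch of the gather side (layout as noted in the comments is an assumption; this is not the Paddle kernel):

#include <vector>

// Rows of `col` index (c, zk, yk, xk); columns index output voxels (z, y, x).
// Window positions that fall into the padding contribute zeros.
void vol2ColSketch(const std::vector<float>& vol, std::vector<float>& col,
                   int C, int D, int H, int W, int kD, int kH, int kW,
                   int sD, int sH, int sW, int pD, int pH, int pW,
                   int outD, int outH, int outW) {
  size_t idx = 0;
  for (int c = 0; c < C; ++c)
    for (int zk = 0; zk < kD; ++zk)
      for (int yk = 0; yk < kH; ++yk)
        for (int xk = 0; xk < kW; ++xk)
          for (int z = 0; z < outD; ++z)
            for (int y = 0; y < outH; ++y)
              for (int x = 0; x < outW; ++x) {
                int zi = z * sD - pD + zk;
                int yi = y * sH - pH + yk;
                int xi = x * sW - pW + xk;
                bool inside = zi >= 0 && zi < D && yi >= 0 && yi < H &&
                              xi >= 0 && xi < W;
                col[idx++] =
                    inside ? vol[((c * D + zi) * H + yi) * W + xi] : 0.0f;
              }
}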
*/ - -#pragma once -#include -#include -#include "paddle/math/SparseMatrix.h" - -namespace paddle { - -void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) { - ASSERT_EQ(a->getWidth(), b->getWidth()); - ASSERT_EQ(a->getHeight(), b->getHeight()); - ASSERT_EQ(a->isTransposed(), b->isTransposed()); - for (size_t r = 0; r < a->getHeight(); ++r) { - for (size_t c = 0; c < a->getWidth(); ++c) { - ASSERT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c)); - } - } -} - -void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) { - ASSERT_EQ(a.getWidth(), b.getWidth()); - ASSERT_EQ(a.getHeight(), b.getHeight()); - ASSERT_EQ(a.isTransposed(), b.isTransposed()); - ASSERT_EQ(a.getFormat(), b.getFormat()); - ASSERT_EQ(a.getElementCnt(), b.getElementCnt()); - for (size_t r = 0; r < a.getElementCnt(); ++r) { - ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]); - } -} - -void checkSMatrixEqual(const CpuSparseMatrixPtr& a, - const CpuSparseMatrixPtr& b) { - ASSERT_EQ(a->getWidth(), b->getWidth()); - ASSERT_EQ(a->getHeight(), b->getHeight()); - ASSERT_EQ(a->isTransposed(), b->isTransposed()); - ASSERT_EQ(a->getFormat(), b->getFormat()); - ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); - for (size_t r = 0; r < a->getElementCnt(); ++r) { - ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); - } -} - -void checkSMatrixEqual2(const CpuSparseMatrixPtr& a, - const CpuSparseMatrixPtr& b) { - ASSERT_EQ(a->getWidth(), b->getWidth()); - ASSERT_EQ(a->getHeight(), b->getHeight()); - ASSERT_EQ(a->isTransposed(), b->isTransposed()); - ASSERT_EQ(a->getFormat(), b->getFormat()); - ASSERT_EQ(a->getValueType(), b->getValueType()); - ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); - if (a->getFormat() == SPARSE_CSR) { - for (size_t r = 0; r < a->getElementCnt(); ++r) { - ASSERT_EQ(a->getCols()[r], b->getCols()[r]); - if (a->getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); - } - } - for (size_t r = 0; r <= a->getHeight(); r++) { - ASSERT_EQ(a->getRows()[r], b->getRows()[r]); - } - } else { - for (size_t r = 0; r < a->getElementCnt(); ++r) { - ASSERT_EQ(a->getRows()[r], b->getRows()[r]); - if (a->getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); - } - } - for (size_t r = 0; r <= a->getWidth(); r++) { - ASSERT_EQ(a->getCols()[r], b->getCols()[r]); - } - } -} - -void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) { - ASSERT_EQ(a.getWidth(), b.getWidth()); - ASSERT_EQ(a.getHeight(), b.getHeight()); - ASSERT_EQ(a.isTransposed(), b.isTransposed()); - - if (a.getFormat() == SPARSE_CSC) { - int* rows = a.getRows(); - for (size_t i = 0; i < a.getWidth(); i++) { - for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) { - if (a.getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i)); - } else { - ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i)); - } - } - } - } else { - int* cols = a.getCols(); - for (size_t i = 0; i < a.getHeight(); i++) { - for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) { - if (a.getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j])); - } else { - ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j])); - } - } - } - } -} - -void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a, - const CpuMatrixPtr& b) { - ASSERT_EQ(a->getWidth(), b->getWidth()); - ASSERT_EQ(a->getHeight(), b->getHeight()); - ASSERT_EQ(a->isTransposed(), b->isTransposed()); - - if (a->getFormat() == SPARSE_CSC) 
{ - int* rows = a->getRows(); - for (size_t i = 0; i < a->getWidth(); i++) { - for (size_t j = a->getColStartIdx(i); j < a->getColStartIdx(i + 1); j++) { - if (a->getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(rows[j], i)); - } else { - ASSERT_FLOAT_EQ(1.0, b->getElement(rows[j], i)); - } - } - } - } else { - int* cols = a->getCols(); - for (size_t i = 0; i < a->getHeight(); i++) { - for (size_t j = a->getRowStartIdx(i); j < a->getRowStartIdx(i + 1); j++) { - if (a->getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(i, cols[j])); - } else { - ASSERT_FLOAT_EQ(1.0, b->getElement(i, cols[j])); - } - } - } - } -} - -void checkSMatrixErr(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) { -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - ASSERT_EQ(a->getWidth(), b->getWidth()); - ASSERT_EQ(a->getHeight(), b->getHeight()); - ASSERT_EQ(a->isTransposed(), b->isTransposed()); - ASSERT_EQ(a->getFormat(), b->getFormat()); - ASSERT_EQ(a->getValueType(), b->getValueType()); - ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); - int count = 0; - if (a->getFormat() == SPARSE_CSR) { - for (size_t r = 0; r < a->getElementCnt(); ++r) { - ASSERT_EQ(a->getCols()[r], b->getCols()[r]); - if (a->getValueType() == FLOAT_VALUE) { - real aVal = a->getValue()[r]; - real bVal = b->getValue()[r]; - if (std::abs(aVal - bVal) > err) { - if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { - LOG(INFO) << "a=" << aVal << "\t" - << "b=" << bVal; - count++; - } - } - } - } - for (size_t r = 0; r <= a->getHeight(); r++) { - ASSERT_EQ(a->getRows()[r], b->getRows()[r]); - } - } else { - for (size_t r = 0; r < a->getElementCnt(); ++r) { - ASSERT_EQ(a->getRows()[r], b->getRows()[r]); - if (a->getValueType() == FLOAT_VALUE) { - real aVal = a->getValue()[r]; - real bVal = b->getValue()[r]; - if (std::abs(aVal - bVal) > err) { - if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { - count++; - } - } - } - } - for (size_t r = 0; r <= a->getWidth(); r++) { - ASSERT_EQ(a->getCols()[r], b->getCols()[r]); - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) { - CHECK(matrix1.getHeight() == matrix2.getHeight()); - CHECK(matrix1.getWidth() == matrix2.getWidth()); -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - - int height = matrix1.getHeight(); - int width = matrix1.getWidth(); - const real* data1 = matrix1.getData(); - const real* data2 = matrix2.getData(); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - real a = data1[i * width + j]; - real b = data2[i * width + j]; - if (std::abs(a - b) > err) { - if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) { - count++; - } - } - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -void checkDataEqual(const real* a, const real* b, size_t size) { - for (size_t i = 0; i < size; ++i) { - ASSERT_FLOAT_EQ(a[i], b[i]); - } -} - -} // namespace paddle diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp deleted file mode 100644 index 12647d21a29936e169b893ec8119b64fec9af580..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_sparseMatrixCompare.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
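Note the two-level tolerance used by `checkSMatrixErr` and `checkMatrixErr` above: a pair of values only counts as different when it fails both an absolute test and a ten-times-tighter relative test, which keeps large-magnitude CPU/GPU rounding differences from tripping the check. The predicate, extracted into standalone form:

#include <cmath>

// True when |a - b| <= err, or when the error relative to a is within err/10.
inline bool nearlyEqual(float a, float b, float err = 1e-3f) {
  if (std::abs(a - b) <= err) return true;              // absolute test
  return std::abs(a - b) / std::abs(a) <= err / 10.0f;  // relative test
}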
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_CUDA -/// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result, -// so disable when -/// only cpu version. - -#include -#include "paddle/math/Matrix.h" -#include "paddle/utils/Util.h" -#include "test_matrixUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -static inline int uniformRandom(int n) { return n == 0 ? 0 : rand() % n; } - -void testSpMatrixAddBias(int M, int N, real rate, real scale) { - int nnz = M * N * rate; - - MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz)); - MatrixPtr cpuB = std::make_shared(1, N); - - MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz)); - MatrixPtr gpuB = std::make_shared(1, N); - - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - - hl_stream_t stream(HPPL_STREAM_1); - gpuA->copyFrom(*cpuA, stream); - gpuB->copyFrom(*cpuB, stream); - hl_stream_synchronize(stream); - - cpuA->addBias(*cpuB, scale); - gpuA->addBias(*gpuB, scale); - - MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz)); - outputCheck->copyFrom(*gpuA, stream); - hl_stream_synchronize(stream); - checkSMatrixEqual2(std::dynamic_pointer_cast(cpuA), - std::dynamic_pointer_cast(outputCheck)); -} - -void testSpMatrixAddDense(int M, int N, real rate) { // add3 - int nnz = M * N * rate; - - MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz)); - MatrixPtr cpuB = std::make_shared(M, N); - - MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz)); - MatrixPtr gpuB = std::make_shared(M, N); - - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - - hl_stream_t stream(HPPL_STREAM_3); - gpuA->copyFrom(*cpuA, stream); - gpuB->copyFrom(*cpuB, stream); - hl_stream_synchronize(stream); - - cpuA->add3(cpuB); - gpuA->add3(gpuB); - - MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz)); - outputCheck->copyFrom(*gpuA, stream); - hl_stream_synchronize(stream); - checkSMatrixEqual2(std::dynamic_pointer_cast(cpuA), - std::dynamic_pointer_cast(outputCheck)); -} - -void testSpMatrixMul(int M, int N, int K, real rate) { - int nnz = M * N * rate; - - MatrixPtr cpuA = std::make_shared(M, K); - MatrixPtr cpuB = std::make_shared(N, K); - MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz)); - - MatrixPtr gpuA = std::make_shared(M, K); - MatrixPtr gpuB = std::make_shared(N, K); - MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz)); - - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - cpuC->randomizeUniform(); - - hl_stream_t stream(HPPL_STREAM_3); - gpuA->copyFrom(*cpuA, stream); - gpuB->copyFrom(*cpuB, stream); - gpuC->copyFrom(*cpuC, stream); - hl_stream_synchronize(stream); - - cpuC->mul(*cpuA, *cpuB->getTranspose(), 1, 1); - gpuC->mul(*gpuA, *gpuB->getTranspose(), 1, 1); - - MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz)); - outputCheck->copyFrom(*gpuC, stream); - hl_stream_synchronize(stream); - checkSMatrixErr(std::dynamic_pointer_cast(cpuC), - std::dynamic_pointer_cast(outputCheck)); -} - -void testSpMatrixCollectBias(int M, int N, real rate) { - int nnz = M * N * rate; - LOG(INFO) << "nnz=" << nnz; - - MatrixPtr cpuA(new 
CpuSparseMatrix(M, N, nnz)); - MatrixPtr cpuB = std::make_shared(1, N); - - MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz)); - MatrixPtr gpuB = std::make_shared(1, N); - - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - - hl_stream_t stream(HPPL_STREAM_3); - gpuA->copyFrom(*cpuA, stream); - gpuB->copyFrom(*cpuB, stream); - hl_stream_synchronize(stream); - - cpuB->collectBias(*cpuA, 1); - gpuB->collectBias(*gpuA, 1); - - MatrixPtr outputCheck = std::make_shared(1, N); - outputCheck->copyFrom(*gpuB, stream); - hl_stream_synchronize(stream); - checkMatrixErr(*cpuB, *outputCheck); -} - -TEST(SMatrix, sMatrixOp) { - for (auto height : {1, 11, 200}) { - for (auto width : {200, 2048, 20480}) { - VLOG(3) << " height=" << height << " width=" << width; - for (auto rate : {0.02, 0.1}) { - testSpMatrixAddDense(height, width, rate); - testSpMatrixAddBias(height, width, rate, 1.0); - } - } - } -} - -TEST(SMatrix, sMatrixMul) { - for (auto M : {1, 40, 128, 200}) { - for (auto N : {100, 2000, 20480}) { - for (auto K : {100, 512, 1024}) { - VLOG(3) << " M=" << M << " N=" << N << " K=" << K; - testSpMatrixMul(M, N, K, 0.05); - } - } - } -} - -TEST(SMatrix, sMatrixCollectBias) { - for (auto height : {1, 128, 200}) { - for (auto width : {100, 2048, 20480}) { - VLOG(3) << " height=" << height << " width=" << width; - testSpMatrixCollectBias(height, width, 0.1); - } - } -} - -#endif diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp deleted file mode 100644 index 94522f718a0c19bfc704ca92eddef5c5a9cb6919..0000000000000000000000000000000000000000 --- a/paddle/parameter/Argument.cpp +++ /dev/null @@ -1,707 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "Argument.h" -#include "paddle/math/SparseMatrix.h" - -#include - -namespace paddle { -static void resizeAndCopy(MatrixPtr& dest, - const MatrixPtr& src, - bool useGpu, - hl_stream_t stream) { - if (src) { - if (!dest) { - dest = src->clone(0, 0, useGpu); - } else { - CHECK_EQ(dest->useGpu(), useGpu); - dest->resize(src->getHeight(), src->getWidth()); - } - dest->copyFrom(*src, stream); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(IVectorPtr& dest, - const IVectorPtr& src, - bool useGpu, - hl_stream_t stream) { - if (src) { - IVector::resizeOrCreate(dest, src->getSize(), useGpu); - dest->copyFrom(*src, stream); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(ICpuGpuVectorPtr& dest, - const ICpuGpuVectorPtr& src, - bool useGpu, - hl_stream_t stream) { - if (src) { - ICpuGpuVector::resizeOrCreate(dest, src->getSize(), useGpu); - dest->copyFrom(*src, stream); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(MatrixPtr& dest, - const MatrixPtr& src, - int32_t startRow, - int32_t copySize, - bool useGpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT) { - if (src) { - CHECK_LE((size_t)startRow + copySize, src->getHeight()); - int height = copySize; - int width = src->getWidth(); - if (!dest) { - dest = src->clone(height, width, useGpu); - } else { - CHECK_EQ(dest->useGpu(), useGpu); - dest->resize(height, width); - } - MatrixPtr submat = src->subMatrix(startRow, copySize); - if (dynamic_cast(dest.get())) { - // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix. - // First copy it to CPU, and then copy it to the GPU. - MatrixPtr tmp = src->clone(height, width, false); - tmp->copyFrom(*submat, stream); - dest->copyFrom(*tmp, stream); - } else { - dest->copyFrom(*submat, stream); - } - } else { - dest.reset(); - } -} - -static void resizeAndCopy(IVectorPtr& dest, - const IVectorPtr& src, - int32_t startPos, - int32_t copySize, - bool useGpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT) { - if (src) { - CHECK_LE((size_t)startPos + copySize, src->getSize()); - - int height = copySize; - IVector::resizeOrCreate(dest, height, useGpu); - dest->copyFrom(src->getData() + startPos, height, stream); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(ICpuGpuVectorPtr& dest, - const ICpuGpuVectorPtr& src, - int32_t startPos, - int32_t copySize, - bool useGpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT) { - if (src) { - CHECK_LE((size_t)startPos + copySize, src->getSize()); - - ICpuGpuVector::resizeOrCreate(dest, copySize, useGpu); - dest->copyFrom(*src, startPos, copySize, useGpu, stream); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(SVectorPtr& dest, - const SVectorPtr& src, - bool useGpu, - hl_stream_t stream) { - if (src) { - size_t height = src->size(); - if (!dest) { - dest = std::make_shared>(height); - } else { - dest->resize(height); - } - std::copy_n(src->begin(), height, dest->begin()); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(SVectorPtr& dest, - const SVectorPtr& src, - int32_t startPos, - int32_t copySize, - bool useGpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT) { - if (src) { - CHECK_LE((size_t)startPos + copySize, src->size()); - size_t height = copySize; - if (!dest) { - dest = std::make_shared>(height); - } else { - dest->resize(height); - } - std::copy_n(src->begin() + startPos, height, dest->begin()); - } else { - dest.reset(); - } -} - -void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) { - resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT); - 
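Every `resizeAndCopy` overload below follows one contract: a null source resets the destination; otherwise the destination is created on first use (or resized) and then filled through the given stream. A generic sketch of that shape — a hypothetical helper over plain shared pointers, not a Paddle API:

#include <memory>

template <typename Container>
void resizeAndCopyLike(std::shared_ptr<Container>& dest,
                       const std::shared_ptr<Container>& src) {
  if (!src) {
    dest.reset();  // null source clears the destination
    return;
  }
  if (!dest) {
    dest = std::make_shared<Container>(*src);  // allocate and copy at once
  } else {
    *dest = *src;  // resize-and-copy via assignment
  }
}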
hl_stream_synchronize(HPPL_STREAM_DEFAULT); -} - -void Argument::resizeAndCopyFrom(const Argument& src, - bool useGpu, - hl_stream_t stream) { - dataId = src.dataId; - resizeAndCopy(value, src.value, useGpu, stream); - resizeAndCopy(grad, src.grad, useGpu, stream); - resizeAndCopy(in, src.in, useGpu, stream); - resizeAndCopy(ids, src.ids, useGpu, stream); - resizeAndCopy(sequenceStartPositions, - src.sequenceStartPositions, - false /* useGpu */, - stream); - if (src.hasSubseq()) { - resizeAndCopy(subSequenceStartPositions, - src.subSequenceStartPositions, - false /* useGpu */, - stream); - } - resizeAndCopy(strs, src.strs, useGpu, stream); - frameWidth = src.frameWidth; - frameHeight = src.frameHeight; - frameDepth = src.frameDepth; -} - -int32_t Argument::resizeAndCopyFrom(const Argument& src, - int32_t startSeq, - int32_t copySize, - bool useGpu) { - int32_t size = - resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - return size; -} - -int32_t Argument::resizeAndCopyFrom(const Argument& src, - int32_t startSeq, - int32_t copySize, - bool useGpu, - hl_stream_t stream) { - dataId = src.dataId; - frameWidth = src.frameWidth; - frameHeight = src.frameHeight; - frameDepth = src.frameDepth; - - if (!src.sequenceStartPositions) { - // non-sequence input, copy samples directly - int32_t startRow = startSeq; - resizeAndCopy(in, src.in, startRow, copySize, useGpu, stream); - resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream); - resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream); - resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream); - resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream); - return copySize; - } else { - // sequence input - const int* sequence = src.sequenceStartPositions->getData(false); - int32_t startRow = sequence[startSeq]; // sample start from here - int32_t endRow = sequence[startSeq + copySize]; // sample end - int32_t copyFeatureSize = endRow - startRow; // num of samples - resizeAndCopy(in, src.in, startRow, copyFeatureSize, useGpu, stream); - resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream); - resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream); - resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream); - resizeAndCopy(sequenceStartPositions, - src.sequenceStartPositions, - startSeq, - copySize + 1, - false, - stream); - // modify new sequenceStartPositions - int* destSequences = sequenceStartPositions->getMutableData(false); - for (int i = 0; i < copySize + 1; i++) { - destSequences[i] -= startRow; - } - CHECK_EQ(destSequences[0], 0); - CHECK_EQ(destSequences[copySize], copyFeatureSize); - if (src.hasSubseq()) { - // sequence has sub-sequence - int* subSequence = src.subSequenceStartPositions->getMutableData(false); - int32_t subStartSeq = 0; - int32_t subEndSeq = 0; - int numSubSequences = src.getNumSubSequences(); - for (int i = 0; i < numSubSequences + 1; i++) { - if (subSequence[i] == startRow) { - subStartSeq = i; - } else if (subSequence[i] == endRow) { - subEndSeq = i; - break; - } - } - int32_t copySubSize = subEndSeq - subStartSeq; - resizeAndCopy(subSequenceStartPositions, - src.subSequenceStartPositions, - subStartSeq, - copySubSize + 1, - false, - stream); - // modify new subSequenceStartPositions - int* destSubSequences = subSequenceStartPositions->getMutableData(false); - for (int i = 0; i < copySubSize + 1; i++) { - destSubSequences[i] -= startRow; - } - 
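The sequence branch of `resizeAndCopyFrom` copies `copySize + 1` start positions and then rebases them so the slice begins at row 0 — hence the two CHECK_EQ assertions that follow it. The rebasing step in isolation:

#include <vector>

// Slice sequences [startSeq, startSeq + copySize) out of a start-position
// array; the result always begins with 0 and ends with the copied row count.
std::vector<int> sliceSeqStarts(const std::vector<int>& starts, int startSeq,
                                int copySize) {
  std::vector<int> out(copySize + 1);
  int base = starts[startSeq];
  for (int i = 0; i <= copySize; ++i) out[i] = starts[startSeq + i] - base;
  return out;
}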
CHECK_EQ(destSubSequences[0], 0); - CHECK_EQ(destSubSequences[copySubSize], copyFeatureSize); - } - resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream); - return copyFeatureSize; - } -} - -void Argument::concat(const std::vector& args, - const std::vector& selectRows, - const std::vector& seqStartPos, - const std::vector& copySize, - bool useGpu, - hl_stream_t stream, - PassType passType) { - CHECK(!subSequenceStartPositions) - << "undefined behavior for subsequence positions"; - - size_t batchSize = 0; - for (size_t i = 0; i < copySize.size(); ++i) - batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]); - - auto copyArg = [batchSize, stream](MatrixPtr& dst, - MatrixPtr src, - int desStartRow, - int srcStartRow, - int size, - bool useGpu) { - if (!src) { - dst.reset(); - return; - } - size_t width = src->getWidth(); - if (!dst) { - dst = src->clone(batchSize, width, useGpu); - } else { - dst->resize(batchSize, width); - } - - MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size); - tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream); - }; - - auto copyIds = [batchSize, stream](IVectorPtr& dst, - const IVectorPtr& src, - int desStartRow, - int srcStartRow, - int size, - bool useGpu) { - if (!src) { - dst.reset(); - return; - } - IVector::resizeOrCreate(dst, batchSize, useGpu); - dst->subVec(desStartRow, size) - ->copyFrom(*src->subVec(srcStartRow, size), stream); - }; - - auto copyStrs = [batchSize](SVectorPtr& dst, - const SVectorPtr& src, - int desStartRow, - int srcStartRow, - int size, - bool useGpu) { - if (!src) { - dst.reset(); - return; - } - if (!dst) { - dst = std::make_shared>(batchSize); - } else { - dst->resize(batchSize); - } - std::copy(src->begin() + srcStartRow, - src->begin() + srcStartRow + size, - dst->begin() + desStartRow); - }; - - dataId = args[0].dataId; - CHECK_NE(seqStartPos.size(), 0UL); - int desStartRow = 0; - for (size_t i = 0; i < copySize.size(); ++i) { - int startPos = seqStartPos[i]; - int endPos = seqStartPos[i + 1]; - CHECK_GE(args.size(), static_cast(endPos - startPos)); - for (int j = startPos; j < endPos; ++j) { - const Argument& arg = args[j - startPos]; - CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have " - << "the same dataId."; - const int srcStartRow = selectRows[j]; - copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu); - copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu); - if (passType != PASS_TEST) { - copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu); - } - copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu); - copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu); - desStartRow += copySize[i]; - } - } - ICpuGpuVector::resizeOrCreate( - sequenceStartPositions, seqStartPos.size(), useGpu); - sequenceStartPositions->copyFrom( - seqStartPos.data(), seqStartPos.size(), useGpu); -} - -void Argument::concat(const std::vector& args, - bool useGpu, - hl_stream_t stream, - PassType passType) { - int32_t batchSize = 0; - int64_t numSequences = 0; - int64_t numSubSequences = 0; - for (auto& arg : args) { - batchSize += arg.getBatchSize(); - numSequences += arg.getNumSequences(); - numSubSequences += arg.getNumSubSequences(); - } - - auto copyArg = [batchSize, stream]( - MatrixPtr& dst, MatrixPtr src, int startRow, bool useGpu) { - if (!src) { - dst.reset(); - return; - } - size_t width = src->getWidth(); - if (!dst) { - dst = src->clone(batchSize, width, useGpu); - } else { - dst->resize(batchSize, 
width); - } - - MatrixPtr tmpMatrix = dst->subMatrix(startRow, src->getHeight()); - tmpMatrix->copyFrom(*src, stream); - }; - - auto copyIds = [batchSize, stream]( - IVectorPtr& dst, const IVectorPtr& src, int startRow, bool useGpu) { - if (!src) { - dst.reset(); - return; - } - IVector::resizeOrCreate(dst, batchSize, useGpu); - dst->subVec(startRow, src->getSize())->copyFrom(*src, stream); - }; - - auto copyStrs = [batchSize]( - SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) { - if (!src) { - dst.reset(); - return; - } - if (!dst) { - dst = std::make_shared>(batchSize); - } else { - dst->resize(batchSize); - } - std::copy(src->begin(), src->end(), dst->begin() + startRow); - }; - - auto copySequencePos = [](ICpuGpuVectorPtr& dstSeq, - const ICpuGpuVectorPtr& srcSeq, - int dstNumSequences, - int srcNumSequences, - int& startSequences, - int startRow) { - if (srcSeq) { - ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false); - const int* src = srcSeq->getData(false); - int* dest = dstSeq->getMutableData(false); - for (int i = 0; i < srcNumSequences + 1; ++i) { - dest[i + startSequences] = src[i] + startRow; - } - startSequences += srcNumSequences; - } else { - dstSeq.reset(); - } - }; - - int startRow = 0; - int startSequences = 0; - int startSubSequences = 0; - dataId = args[0].dataId; - for (auto& arg : args) { - CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have" - << " same dataId"; - copyArg(in, arg.in, startRow, useGpu); - copyArg(value, arg.value, startRow, useGpu); - if (passType != PASS_TEST) copyArg(grad, arg.grad, startRow, useGpu); - copyIds(ids, arg.ids, startRow, useGpu); - copySequencePos(sequenceStartPositions, - arg.sequenceStartPositions, - numSequences, - arg.getNumSequences(), - startSequences, - startRow); - copySequencePos(subSequenceStartPositions, - arg.subSequenceStartPositions, - numSubSequences, - arg.getNumSubSequences(), - startSubSequences, - startRow); - copyStrs(strs, arg.strs, startRow, useGpu); - startRow += arg.getBatchSize(); - } -} - -void Argument::splitByDataId(const std::vector& argus, - std::vector>* arguGroups) { - arguGroups->clear(); - int lastDataId = -1; - for (const auto& argu : argus) { - if (argu.dataId == -1) { - // is -1, then create a new group - arguGroups->emplace_back(); - lastDataId = -1; - } else if (argu.dataId != lastDataId) { - // not -1, also not equal to last Argument, then create a new group - arguGroups->emplace_back(); - lastDataId = argu.dataId; - } else { - // not -1, and equal to last Argument, do nothing - } - arguGroups->back().push_back(argu); - } -} - -void Argument::getSeqInfo(std::vector* seqInfo) const { - const int* starts = sequenceStartPositions->getData(false); - const int* subStarts = - hasSubseq() ? 
subSequenceStartPositions->getData(false) : nullptr; - size_t numSequences = getNumSequences(); - seqInfo->reserve(numSequences); - int subSeqEnd = 0; - for (size_t i = 0; i < numSequences; ++i) { - SeqInfo info; - info.seqStart = starts[i]; - info.subLevelLength = starts[i + 1] - starts[i]; - info.seqId = i; - if (hasSubseq()) { - info.subSeqStart = subSeqEnd; - while (subStarts[subSeqEnd] < starts[i + 1]) { - ++subSeqEnd; - } - info.topLevelLength = subSeqEnd - info.subSeqStart; - } else { - info.topLevelLength = info.subLevelLength; - info.subSeqStart = 0; // not used - } - seqInfo->push_back(info); - } - std::sort( - seqInfo->begin(), seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) { - return a.topLevelLength > b.topLevelLength; - }); -} - -void Argument::checkSubset() const { - if (getNumSequences() > getNumSubSequences()) { - LOG(FATAL) << "numSubSequences is less than numSequences (" - << getNumSubSequences() << " vs. " << getNumSequences() << ")"; - } - const int* start = sequenceStartPositions->getData(false); - const int* subStart = subSequenceStartPositions->getData(false); - int seqId = 0; - int subSeqId = 0; - while (seqId < getNumSequences() && subSeqId < getNumSubSequences()) { - if (start[seqId] > subStart[subSeqId]) { - ++subSeqId; - } else if (start[seqId] == subStart[subSeqId]) { - ++subSeqId; - ++seqId; - } else { - LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions"; - } - } - if (seqId < getNumSequences()) { - LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions"; - } -} - -void Argument::degradeSequence(const Argument& input) { - CHECK_EQ(input.hasSubseq(), 1UL); - size_t numSequences = input.getNumSequences(); - size_t numSubSequences = input.getNumSubSequences(); - ICpuGpuVector::resizeOrCreate( - sequenceStartPositions, numSequences + 1, false); - int* tgtBuf = sequenceStartPositions->getMutableData(false); - const int* starts = input.sequenceStartPositions->getData(false); - const int* subStarts = input.subSequenceStartPositions->getData(false); - int seqId = 0; - for (size_t subSeqId = 0; subSeqId < numSubSequences; ++subSeqId) { - if (subStarts[subSeqId] == starts[seqId]) { - tgtBuf[seqId] = subSeqId; - seqId++; - } - } - tgtBuf[numSequences] = numSubSequences; -} - -void Argument::poolSequenceWithStride(const Argument& input, - size_t stride, - ICpuGpuVectorPtr* stridePostions, - bool reversed) { - // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5, - // then sequenceStartPositions = [0, 2, 3, 4, 7]. 
- // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30]; - // else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30] - - CHECK(input.sequenceStartPositions); - CHECK_EQ(input.hasSubseq(), 0UL); - CHECK_GT(stride, 0UL) << "stride must larger than 0"; - size_t numSequences = input.getNumSequences(); - ICpuGpuVector::resizeOrCreate( - sequenceStartPositions, numSequences + 1, false); - const int* starts = input.sequenceStartPositions->getData(false); - int* tgtBuf = sequenceStartPositions->getMutableData(false); - // first index of target sequence and stride positions are both 0 - tgtBuf[0] = 0; - std::vector stridePos; - for (size_t seqId = 0; seqId < numSequences; ++seqId) { - size_t seqLength = starts[seqId + 1] - starts[seqId]; - stridePos.emplace_back(starts[seqId]); - if (seqLength == 0) { - // empty sequence - tgtBuf[seqId + 1] = tgtBuf[seqId]; - } else { - int size = ceil((float)seqLength / stride); - tgtBuf[seqId + 1] = tgtBuf[seqId] + size; - for (int i = 0; i < size - 1; ++i) { - int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride - : stridePos.back() + stride; - stridePos.emplace_back(cur); - } - } - } - stridePos.emplace_back(starts[numSequences]); - int size = stridePos.size(); - CHECK_EQ(size - 1, tgtBuf[numSequences]); - ICpuGpuVector::resizeOrCreate(*stridePostions, size, false); - (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size); -} - -void Argument::getValueString( - std::unordered_map* out) const { - if (value) { - std::ostringstream os; - value->print(os); - out->insert({"value", os.str()}); - } - if (ids) { - std::ostringstream os; - ids->print(os, ids->getSize()); - out->insert({"ids", os.str()}); - } - if (sequenceStartPositions) { - std::ostringstream os; - sequenceStartPositions->getVector(false)->print( - os, sequenceStartPositions->getSize()); - out->insert({"sequence pos", os.str()}); - } - if (subSequenceStartPositions) { - std::ostringstream os; - subSequenceStartPositions->getVector(false)->print( - os, subSequenceStartPositions->getSize()); - out->insert({"sub-sequence pos", os.str()}); - } -} - -void Argument::printValueString(std::ostream& stream, - const std::string& prefix) const { - std::unordered_map out; - getValueString(&out); - for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) { - auto it = out.find(field); - if (it != out.end()) { - stream << prefix << field << ":\n" << it->second; - } - } -} - -void Argument::subArgFrom(const Argument& input, - size_t offset, - size_t height, - size_t width, - bool useGpu, - bool trans, - bool seqFlag, - size_t seqStart, - size_t seqSize) { - if (input.value) { - value = Matrix::create( - input.value->getData() + offset * width, height, width, trans, useGpu); - } - if (input.ids) { - ids = IVector::create(input.ids->getData() + offset, height, useGpu); - } - if (input.grad) { - grad = Matrix::create( - input.grad->getData() + offset * width, height, width, trans, useGpu); - } - if (seqFlag) { - sequenceStartPositions = std::make_shared( - *(input.sequenceStartPositions), seqStart, seqSize); - } -} - -void Argument::reorganizeSeqInfo( - const ICpuGpuVectorPtr seqStartPos, - const ICpuGpuVectorPtr subSeqStartPos, - std::vector>& reorganizedSeqInfo) { - CHECK(seqStartPos); - reorganizedSeqInfo.clear(); - - int seqNum = seqStartPos->getSize() - 1; - int* seqStarts = seqStartPos->getMutableData(false); - - if (subSeqStartPos) { - int* subSeqStarts = subSeqStartPos->getMutableData(false); - reorganizedSeqInfo.resize(seqNum, 
std::vector()); - int seqIdx = 0; - for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) { - reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); - if (subSeqStarts[i] == seqStarts[seqIdx + 1]) { - seqIdx++; - if (seqIdx == seqNum) return; - reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); - } - } - } else { - reorganizedSeqInfo.resize(1, std::vector(seqNum + 1, 0)); - memcpy(reorganizedSeqInfo[0].data(), - seqStarts, - sizeof(int) * seqStartPos->getSize()); - } -} - -} // namespace paddle diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h deleted file mode 100644 index e580d38216b699360fb30f135be8052ab56abf66..0000000000000000000000000000000000000000 --- a/paddle/parameter/Argument.h +++ /dev/null @@ -1,349 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "hl_gpu.h" - -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -typedef std::shared_ptr> SVectorPtr; - -struct Argument { - Argument() - : in(nullptr), - value(nullptr), - ids(nullptr), - grad(nullptr), - strs(nullptr), - frameHeight(0), - frameWidth(0), - frameDepth(0), - sequenceStartPositions(nullptr), - subSequenceStartPositions(nullptr), - cpuSequenceDims(nullptr), - deviceId(-1), - allCount(0), - valueCount(0), - gradCount(0), - dataId(0) {} - Argument(const Argument& argument) { - *this = argument; - valueCount = 0; - gradCount = 0; - dataId = argument.dataId; - } - ~Argument() {} - - void operator=(const Argument& argument) { - in = argument.in; - value = argument.value; - ids = argument.ids; - grad = argument.grad; - strs = argument.strs; - sequenceStartPositions = argument.sequenceStartPositions; - subSequenceStartPositions = argument.subSequenceStartPositions; - cpuSequenceDims = argument.cpuSequenceDims; - deviceId = argument.deviceId; - allCount = argument.allCount; - frameHeight = argument.frameHeight; - frameWidth = argument.frameWidth; - frameDepth = argument.frameDepth; - dataId = argument.dataId; - } - - MatrixPtr in; // used if needed - MatrixPtr value; - IVectorPtr ids; // a sequence of ids. Can be use for class id for costLayer - MatrixPtr grad; // If empty, gradient is not needed. - SVectorPtr strs; - - // A dataBatch includes batchSize frames, one frame maybe not only vector - size_t frameHeight; - size_t frameWidth; - size_t frameDepth; - - // If NULL, each position is treated independently. - // Otherwise, its size should be #NumberOfSequences + 1. - // The first position is always 0 and - // the last position should be equal to batchSize. - ICpuGpuVectorPtr sequenceStartPositions; - - // If NULL, each sequence has no subsequence. - // Otherwise, its size should be #NumberOfSubSequences + 1. - // The first position is always 0 and - // the last position should be equal to batchSize. 
- ICpuGpuVectorPtr subSequenceStartPositions; - - // dimension of sequence, stored only in CPU - IVectorPtr cpuSequenceDims; - - int deviceId; // the GPU device id which the argument in - int allCount; // the number of output layers using this argument - mutable int valueCount; // waiting this member when layer do forward - mutable int gradCount; // waiting this member when layer do backward - mutable LockedCondition valueReadyCond; - mutable LockedCondition gradReadyCond; - - int dataId; // dataProvider id - - /* Increase the reference count of the argument. */ - void countIncrement() { allCount++; } - - int getAllCount() const { return allCount; } - - void waitValueReady() const { - valueReadyCond.wait([this] { return (valueCount != 0); }); - - std::lock_guard guard(*valueReadyCond.mutex()); - valueCount--; - } - - void notifyValueReady() const { - valueReadyCond.notify_all([this] { valueCount = allCount; }); - } - - void waitGradReady() const { - gradReadyCond.wait([this] { return (gradCount == allCount); }); - gradCount = 0; - } - - void notifyGradReady() const { - gradReadyCond.notify_all([this] { gradCount++; }); - } - - int64_t getBatchSize() const { - if (value) return value->getHeight(); - if (ids) return ids->getSize(); - if (grad) return grad->getHeight(); - if (in) return in->getHeight(); - if (strs) return strs->size(); - return 0; - } - size_t getFrameHeight() const { return frameHeight; } - size_t getFrameWidth() const { return frameWidth; } - size_t getFrameDepth() const { return frameDepth; } - void setFrameHeight(size_t h) { frameHeight = h; } - void setFrameWidth(size_t w) { frameWidth = w; } - void setFrameDepth(size_t d) { frameDepth = d; } - - int64_t getNumSequences() const { - return sequenceStartPositions ? sequenceStartPositions->getSize() - 1 - : getBatchSize(); - } - - int64_t getNumSubSequences() const { - return subSequenceStartPositions ? subSequenceStartPositions->getSize() - 1 - : getBatchSize(); - } - - bool hasSeq() const { return sequenceStartPositions != nullptr; } - bool hasSubseq() const { return subSequenceStartPositions != nullptr; } - - const int* getCpuStartPositions() const { - return hasSubseq() ? subSequenceStartPositions->getData(false) - : sequenceStartPositions->getData(false); - } - - static inline real sum(const std::vector& arguments) { - real cost = 0; - for (auto& arg : arguments) { - if (arg.value) { - SetDevice device(arg.deviceId); - cost += arg.value->getSum(); - } - } - return cost; - } - - /** - * @brief (value, ids, grad, sequenceStartPositions) of output are subset of - * input. Note that, output share the same memory of input. 
- * - * @param input[in] input - * @param offset[in] offset in terms of rows - * @param height[in] height of output.value - * @param width[in] width of output.value - * @param useGpu[in] - * @param trans[in] whether input.value is transform - * @param seqFlag[in] whether input has sequenceStartPositions - * @param seqStart[in] offset of input.sequenceStartPositions - * @param seqSize[in] lenght of output.sequenceStartPositions - */ - void subArgFrom(const Argument& input, - size_t offset, - size_t height, - size_t width, - bool useGpu, - bool trans = false, - bool seqFlag = false, - size_t seqStart = 0, - size_t seqSize = 0); - /* - * for sequence input: - * startSeq: the sequence id of start - * copySize: how many sequences need to copy - * return value: how many samples are copied - * for non-sequence input: - * startSeq: the sample id of start - * copySize: how many samples need to copy - * return value: how many samples are copied - * Note that when specifying the stream explicitly in this case, - * synchronize should also be called somewhere after this function - */ - int32_t resizeAndCopyFrom(const Argument& src, - int32_t startSeq, - int32_t copySize, - bool useGpu, - hl_stream_t stream); - - /* - * same with the above function, except that the stream is - * HPPL_STREAM_DEFAULT and synchronize is automatically called - * inside it - */ - int32_t resizeAndCopyFrom(const Argument& src, - int32_t startSeq, - int32_t copySize, - bool useGpu = FLAGS_use_gpu); - - void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream); - - /* - * same with the above function, except that the stream is - * HPPL_STREAM_DEFAULT and synchronize is automatically called - * inside it - */ - void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu); - - /* - @brief Concatenate several arguments into one and put the result into it. - @param args : a vector of argument, each element of which is a frame in a - batch of sequences. - @param selectRows : select several row of args to concatenate - @param seqStartPos : sequence start positions in the final Argument - @param hl_stream_t : cuda stream - @param passTyoe : type of task, training or testing - */ - void concat(const std::vector& args, - const std::vector& selectRows, - const std::vector& seqStartPos, - const std::vector& copySize, - bool useGpu, - hl_stream_t stream, - PassType passType); - - /* - Concatenate several args into one and put the result into this. - */ - void concat(const std::vector& src, - bool useGpu = FLAGS_use_gpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT, - PassType passType = PASS_TEST); - - /* - * split vector to several vectors according to dataId - */ - static void splitByDataId(const std::vector& argus, - std::vector>* arguGroups); - - struct SeqInfo { - // Equal to sequence length for sequence data - // Equal to number of subsequences for subsequence data - int topLevelLength; - - int seqStart; - int seqId; - - // Equal to topLevelLength for sequence data - // Equal to sum of the length of subsequences for subsequence data - int subLevelLength; - - // Only used for subsequence data, start position of this sequence - // is subSequenceStartPositions, i.e. - // subSequenceStartPositions[subSeqStart] == seqStart - int subSeqStart; - }; - /* - Get SeqInfo for each sequence of this argument - Elements in *seqInfo are sorted by topLevelLength in descending order - */ - void getSeqInfo(std::vector* segInfo) const; - - /* - Check Whether sequenceStartPositions is subset of - subSequenceStartPositions. 
- /* - Check whether sequenceStartPositions is a subset of - subSequenceStartPositions. - */ - void checkSubset() const; - - /* - Degrade a sequence that has sub-sequences to a plain sequence. - */ - void degradeSequence(const Argument& input); - - /* - After pooling with stride n (n is smaller than the sequence length), - a long sequence will be shortened. - This function is invalid for sequences that have sub-sequences. - */ - void poolSequenceWithStride(const Argument& input, - size_t stride, - ICpuGpuVectorPtr* stridePositions, - bool reversed = false); - /** - * @brief getValueString will return the argument's output as strings. There - * are several kinds of output. The keys of the output dictionary are 'value', - * 'id', 'sequence pos', 'sub-sequence pos'. - * @param out [out]: the return values. - */ - void getValueString(std::unordered_map<std::string, std::string>* out) const; - - /** - * @brief printValueString will print the argument's output in the order of - * 'value', 'id', 'sequence pos', 'sub-sequence pos'. - * @param stream: Output stream - * @param prefix: line prefix for printing. - */ - void printValueString(std::ostream& stream, - const std::string& prefix = "") const; - - /** - * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and - * subSequenceStartPositions into a two-dimensional array: reorganizedSeqInfo. - * - * @param seqStartPos: sequenceStartPositions of an Argument. - * @param subSeqStartPos: subSequenceStartPositions of an Argument. - * @param reorganizedSeqInfo: the reorganized sequence start position information. - * - * Examples: - * seqStartPos: [0, 4, 15, 20, 28] - * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28] - * reorganizedSeqInfo: - * [ - * [0,3,4], - * [4,5,7,10,15], - * [15,20], - * [20,22,23,25,28] - * ] - */ - static void reorganizeSeqInfo( - const ICpuGpuVectorPtr seqStartPos, - const ICpuGpuVectorPtr subSeqStartPos, - std::vector<std::vector<int>>& reorganizedSeqInfo); -}; - -} // namespace paddle diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp deleted file mode 100644 index 182e833405e8f8bc3a4c9ffddbf628040f9cceaa..0000000000000000000000000000000000000000 --- a/paddle/parameter/FirstOrderOptimizer.cpp +++ /dev/null @@ -1,330 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include "FirstOrderOptimizer.h" -#include "paddle/math/TrainingAlgorithmOp.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/Util.h" - -#include - -DEFINE_bool(log_clipping, false, "enable log clipping or not"); - -namespace paddle { - -SparseMomentumParameterOptimizer::SparseMomentumParameterOptimizer( - const OptimizationConfig& optConfig) - : ParameterOptimizer(optConfig) { - addParameterType(PARAMETER_MOMENTUM); - addParameterType(PARAMETER_MOMENTUM_UT); - addParameterType(PARAMETER_MOMENTUM_VT); - alpha_ = 1; - beta_ = 1; - tau_ = -1; - threshold_ = 1e+06; -} - -void SparseMomentumParameterOptimizer::init(size_t numRows, - const ParameterConfig* config) { - isParameterSparse_ = numRows != 0; - t0Vec_.resize(numRows); - t0Vec_.assign(t0Vec_.size(), 0); - timer_ = 0; - momentum_ = config->momentum(); - decayRate_ = config->decay_rate(); - gamma_ = config->learning_rate(); -} - -void SparseMomentumParameterOptimizer::startBatch(int64_t numSamplesProcessed) { - learningRate_ = calcLearningRate(numSamplesProcessed, pass_); - if (isParameterSparse_) { - tau_ = tau_ + beta_ / alpha_; - alpha_ = alpha_ / momentum_; - beta_ = beta_ / (1 + decayRate_ * gamma_ * learningRate_); - } -} - -void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& paraConfig, - size_t sparseId) const { - if (sparseId != -1LU) { - CHECK_LT(sparseId, t0Vec_.size()); - if (t0Vec_[sparseId] == 0) { - vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]); - t0Vec_[sparseId] = 1; - } - vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT], - -alpha_ * gamma_ * learningRate_); - vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT], - tau_ * alpha_ * gamma_ * learningRate_); - vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT], - tau_ / beta_ + 1.0 / alpha_, - *vecs[PARAMETER_MOMENTUM_VT], - 1.0 / beta_); - - } else { - vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_MOMENTUM], - learningRate_ * paraConfig.learning_rate(), - paraConfig.momentum(), - applyDecay_ ? paraConfig.decay_rate() : 0); - } -} - -ParameterOptimizer::TraverseCallback -SparseMomentumParameterOptimizer::needSpecialTraversal( - const ParameterConfig& config) const { - if (alpha_ > threshold_ && isParameterSparse_) { - // Restart to avoid large value multiplication - // 1. \alpha = 1, \beta = 1, \tau = 0 - // 2. 
Note that \tau * u_t + v_t = \beta \theta_t, therefore: - // u_t should be rescaled to u_t/alpha_ - // v_t should be reset to \theta_t - return [this](const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) { - vecs[PARAMETER_MOMENTUM_UT]->divScalar(alpha_); - vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]); - }; - } else { - return nullptr; - } -} - -void SparseMomentumParameterOptimizer::finishBatch() { - timer_++; - if (!isParameterSparse_) return; - if (alpha_ > threshold_) { - alpha_ = 1; - beta_ = 1; - tau_ = -1; - } -} - -void AdagradParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; - - real epsilon = optConfig_.ada_epsilon(); - real learningRate = learningRate_ * config.learning_rate(); - real momentum = config.momentum(); - real decayRate = applyDecay_ ? config.decay_rate() : 0; - - adagradApply(value, - grad, - mom, - accum_buffer, - accum, - lr, - epsilon, - learningRate, - momentum, - decayRate); -} - -ParameterOptimizer::TraverseCallback -AdagradParameterOptimizer::needSpecialTraversal( - const ParameterConfig& config) const { - if (numUpdates_ % kMaxNumAccumulates == 0) { - // Move the sum to a different buffer to avoid loss of precision - // due to too many sums. - return [](const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) { - vecs[PARAMETER_GRADIENT_SQURESUM]->add( - *vecs[PARAMETER_GRADIENT_SQURESUM1]); - vecs[PARAMETER_GRADIENT_SQURESUM1]->zeroMem(); - }; - } else { - return nullptr; - } -} - -void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - CHECK(sparseId == -1LU) << "Sparse update is not supported"; - - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; - - real learningRate = learningRate_ * config.learning_rate(); - real momentum = config.momentum(); - real decayRate = applyDecay_ ? config.decay_rate() : 0; - - adadeltaApply(value, - grad, - mom, - accum, - accum_update, - lr, - rou_, - epsilon_, - learningRate, - momentum, - decayRate); -} - -void RMSPropParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; - - real accumulatedRou = rou_; - bool firstTime = timer_ == 0; - if (sparseId != -1LU) { - CHECK_LT(sparseId, t0Vec_.size()); - accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]); - firstTime = t0Vec_[sparseId] == 0; - t0Vec_[sparseId] = timer_ + 1; - } - - real epsilon = optConfig_.ada_epsilon(); - real learningRate = learningRate_ * config.learning_rate(); - real momentum = config.momentum(); - real decayRate = applyDecay_ ? 
config.decay_rate() : 0; - - rmspropApply(value, - grad, - mom, - sum, - sum1, - lr, - accumulatedRou, - rou_, - epsilon, - learningRate, - momentum, - decayRate, - firstTime); -} - -void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; - - real accumulatedRou = rou_; - bool firstTime = timer_ == 0; - if (sparseId != -1LU) { - CHECK_LT(sparseId, t0Vec_.size()); - accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]); - firstTime = t0Vec_[sparseId] == 0; - t0Vec_[sparseId] = timer_ + 1; - } - - real epsilon = optConfig_.ada_epsilon(); - real learningRate = learningRate_ * config.learning_rate(); - real momentum = config.momentum(); - real decayRate = applyDecay_ ? config.decay_rate() : 0; - - decayedAdagradApply(value, - grad, - mom, - sum, - lr, - accumulatedRou, - rou_, - epsilon, - learningRate, - momentum, - decayRate, - firstTime); -} - -void AdamParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - CHECK(sparseId == -1UL) << "Sparse update is not supported"; - - real beta1_power = std::pow(beta1_, step_); - real beta2_power = std::pow(beta2_, step_); - real learningRate = config.learning_rate() * learningRate_; - - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM]; - - adamApply(value, - grad, - mom, - v, - beta1_, - beta2_, - beta1_power, - beta2_power, - epsilon_, - learningRate); -} - -void AdamaxParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - CHECK(sparseId == -1UL) << "Sparse update is not supported"; - real learningRate = config.learning_rate() * learningRate_; - - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM]; - - adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate); -} - -void OptimizerWithGradientClipping::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - real globalThreshold = optConfig_.gradient_clipping_threshold(); - real localThreshold = config.gradient_clipping_threshold(); - - // Use local gradient clipping threshold if it's enabled, - // otherwise using the global one. - real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold; - std::string field = localThreshold > 0.0f ? 
"local" : "global"; - - real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax(); - if (maxAbsGrad > threshold) { - if (FLAGS_log_clipping) { - real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() / - vecs[PARAMETER_GRADIENT]->getSize(); - LOG(INFO) << "parameter=" << config.name() << " need clipping by " - << field << " threshold=" << threshold - << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad; - } - vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold); - } - optimizer_->update(vecs, config, sparseId); -} - -} // namespace paddle diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp deleted file mode 100644 index 0e6ea90f3d582e843c62bda000313eb71289d5b4..0000000000000000000000000000000000000000 --- a/paddle/parameter/Parameter.cpp +++ /dev/null @@ -1,425 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Parameter.h" -#include -#include -#include "AverageOptimizer.h" -#include "FirstOrderOptimizer.h" -#include "OptimizerFunctions.h" -#include "OptimizerWithRegularizer.h" -#include "ParameterUpdateFunctions.h" -#include "ThreadLocalBuffer.h" -#include "hl_gpu.h" -#include "paddle/math/CpuSparseMatrix.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/SparseRowMatrix.h" -#include "paddle/utils/Logging.h" - -DEFINE_int32(enable_grad_share, - (100 * 1024 * 1024), - "threshold for enable gradient parameter share for batch " - "multi-cpu training"); -DEFINE_int32( - grad_share_block_num, - 64, - "block number of gradient parameter share for batch multi-cpu training"); - -namespace paddle { - -const std::string Parameter::kMissParameterFail = "fail"; -const std::string Parameter::kMissParameterRand = "rand"; -const std::string Parameter::kMissParameterZero = "zero"; - -Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit) - : config_(config), - useGpu_(useGpu), - deviceId_(-1), - sharedCount_(0), - updateCounter_(0), - updated_(false), - headerFormat_(PARAM_FORMAT_ORIGINAL) { - setID(-1); /* capture uninitialized id */ - if (useGpu_ && FLAGS_parallel_nn) { - /* gpu environment is specified by device property */ - deviceId_ = config_.device(); - if (deviceId_ < 0) { - useGpu_ = false; - } - } - - if (doInit) { - initialize(); - } - - for (int i = 0; i < config.update_hooks_size(); ++i) { - this->updaterHooks_.push_back(IParameterUpdaterHook::create(config, i)); - } -} - -void Parameter::initialize() { - SetDevice device(deviceId_); - - bufs_[PARAMETER_VALUE] = - Vector::createParallelVector(config_.size(), useGpu_); - bufs_[PARAMETER_VALUE]->zeroMem(); - - if (config_.is_sparse()) { - enableSparseParameter(); - } - - if (!isStatic()) { - bufs_[PARAMETER_GRADIENT] = - Vector::createParallelVector(config_.size(), useGpu_); - bufs_[PARAMETER_MOMENTUM] = - Vector::createParallelVector(config_.size(), useGpu_); - - bufs_[PARAMETER_GRADIENT]->zeroMem(); - bufs_[PARAMETER_MOMENTUM]->zeroMem(); - } -} - -void Parameter::randomize(const VectorPtr& value, - const 
ParameterConfig& config) { - if (PARAMETER_INIT_UNIFORM == config.initial_strategy()) { - // initialize the parameter as uniform distribution - real initial_min = config.initial_mean() - config.initial_std(); - real initial_max = config.initial_mean() + config.initial_std(); - value->uniform(initial_min, initial_max); - VLOG(1) << config.name() << ": initial_min=" << initial_min - << ", initial_max=" << initial_max; - } else if (PARAMETER_INIT_NORMAL == config.initial_strategy()) { - /* Initialize the parameters randomly */ - value->randnorm(config.initial_mean(), config.initial_std()); - VLOG(1) << config.name() << ": initial_mean=" << config.initial_mean() - << ", initial_std=" << config.initial_std(); - } else { - LOG(FATAL) << "not supported initial_strategy: " - << config.initial_strategy(); - } -} - -void Parameter::randomize() { - if (!bufs_[PARAMETER_VALUE]) return; - SetDevice device(deviceId_); - Parameter::randomize(bufs_[PARAMETER_VALUE], config_); - - if (config_.is_sparse()) { - if (format_ == SPARSE_CSC) { - sparseRand(intBufs_[PARAMETER_COLS]->getData(), - intBufs_[PARAMETER_ROWS]->getData(), - config_.size(), - config_.dims(1) + 1, - config_.dims(0), - useGpu_); - } else { - sparseRand(intBufs_[PARAMETER_ROWS]->getData(), - intBufs_[PARAMETER_COLS]->getData(), - config_.size(), - config_.dims(0) + 1, - config_.dims(1), - useGpu_); - } - } - setValueUpdated(); -} - -void Parameter::zeroMem() { - if (!bufs_[PARAMETER_VALUE]) return; - bufs_[PARAMETER_VALUE]->zeroMem(); - setValueUpdated(); - LOG(INFO) << getName() << " set to 0"; -} - -bool Parameter::isGradShared(size_t* blockNum) { - if (!useGpu_ && !isStatic() && FLAGS_enable_grad_share > 0 && - !isGradSparseUpdate() && - this->getSize() > (size_t)FLAGS_enable_grad_share) { - if (blockNum) { - *blockNum = (size_t)FLAGS_grad_share_block_num; - } - return true; - } - return false; -} - -bool Parameter::isValueShared() { - return !useGpu_ && config_.is_shared() && FLAGS_trainer_count > 1; -} - -bool Parameter::isGradSparseUpdate() const { - return !useGpu_ && !isStatic() && - (config_.sparse_update() || config_.sparse_remote_update()); -} - -void Parameter::setMat(ParameterType pType, int matType) { - CHECK(!mats_[pType]); - - if (config_.dims_size() == 0 && matType == MAT_NORMAL) { - return; - } - - CHECK_EQ((size_t)config_.dims_size(), 2LU); - size_t height = config_.dims(0); - size_t width = config_.dims(1); - if (matType == MAT_NORMAL) { - if (!config_.is_sparse()) { - CHECK_EQ(height * width, bufs_[pType]->getSize()); - mats_[pType] = - Matrix::create(bufs_[pType]->getMemoryHandle(), height, width); - } else { - size_t size = bufs_[pType]->getSize(); - CHECK_GE(height * width, size); - if (format_ == SPARSE_CSR) { - CHECK_EQ(height + 1, intBufs_[PARAMETER_ROWS]->getSize()); - CHECK_EQ(size, intBufs_[PARAMETER_COLS]->getSize()); - } else { - CHECK_EQ(width + 1, intBufs_[PARAMETER_COLS]->getSize()); - CHECK_EQ(size, intBufs_[PARAMETER_ROWS]->getSize()); - } - mats_[pType] = - Matrix::createSparseMatrix(bufs_[pType]->getData(), - intBufs_[PARAMETER_ROWS]->getData(), - intBufs_[PARAMETER_COLS]->getData(), - height, - width, - bufs_[pType]->getSize(), - FLOAT_VALUE, - format_, - false, - useGpu_); - } - } -#ifndef PADDLE_MOBILE_INFERENCE - // NOLINTNEXTLINE - else if (matType == MAT_NORMAL_SHARED) { - CHECK_EQ(height * width, bufs_[pType]->getSize()); - size_t blockNum = 0; - CHECK(isGradShared(&blockNum)); - mats_[pType] = std::make_shared( - blockNum, - std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()), - 
height, - width); - } else if (matType == MAT_VALUE_SHARED) { - CHECK_EQ(height * width, bufs_[pType]->getSize()); - mats_[pType] = std::make_shared( - std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()), - height, - width); - } else if (matType == MAT_SPARSE_ROW_IDS) { - CHECK_EQ(height * width, bufs_[pType]->getSize()); - mats_[pType] = std::make_shared( - std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()), - height, - width); - } else if (matType == MAT_SPARSE_ROW) { - auto valueMat = - std::dynamic_pointer_cast(mats_[PARAMETER_VALUE]); - SparseRowCpuMatrix::IndexDictPtr indexDict(nullptr); - if (pType != PARAMETER_VALUE) { - CHECK(valueMat) << "The matrix for PARAMETER_VALUE must be set " - << " and its type must be MAT_SPARSE_ROW," - << " MAT_SPARSE_ROW_PREFETCH or MAT_CACHE_ROW"; - indexDict = valueMat->getIndexDictHandle(); - } - auto mat = - std::make_shared(nullptr, - height, - width, - // grad share index with value - indexDict); - mats_[pType] = mat; - } else if (matType == MAT_CACHE_ROW) { - CHECK(isGradSparseUpdate()); - auto mat = std::make_shared(height, width); - mats_[pType] = mat; - } else if (matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || - matType == MAT_SPARSE_ROW_PREFETCH) { - auto mat = std::make_shared( - bufs_[pType] ? std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()) - : nullptr, - height, - width, - nullptr, // indexDictHandle - getGlobalSyncThreadPool()); - mats_[pType] = mat; - } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) { - CHECK(isGradSparseUpdate()); - mats_[pType] = std::make_shared(height, width); - } -#endif - // NOLINTNEXTLINE - else { - LOG(FATAL) << "Unsupported mat type" << matType; - } -} - -void Parameter::incUpdate(const UpdateCallback& callback) { - // Static parameter is fixed, and does not need to be updated - if (isStatic()) { - return; - } - - ++updateCounter_; - if (isUpdatable()) { - if (callback) callback(this); - clearUpdate(); - } -} - -bool Parameter::save(const std::string& filename) const { - std::ofstream fs(filename, std::ios_base::binary); - CHECK(fs) << "Fail to open " << filename; - return save(fs); -} - -bool Parameter::save(std::ostream& s) const { - CpuVector vec(*bufs_[PARAMETER_VALUE].get()); - Header header; - header.format = headerFormat_; - header.valueSize = sizeof(real); - header.size = getSize(); - - CHECK_EQ(header.size, vec.getSize()); - - CHECK(s.write(reinterpret_cast(&header), sizeof(header))) - << "Fail to write parameter " << getName(); - - CHECK(s.write(reinterpret_cast(vec.getData()), - header.size * sizeof(real))) - << "Fail to write parameter " << getName(); - if (config_.is_sparse()) { - CpuIVector rows(*intBufs_[PARAMETER_ROWS].get()); - CpuIVector cols(*intBufs_[PARAMETER_COLS].get()); - CHECK(s.write(reinterpret_cast(rows.getData()), - rows.getSize() * sizeof(int))) - << "Fail to write parameter " << getName(); - CHECK(s.write(reinterpret_cast(cols.getData()), - cols.getSize() * sizeof(int))) - << "Fail to write parameter " << getName(); - } - - return true; -} - -/** - * Load parameter value from a file - */ -bool Parameter::load(const std::string& filename) { - std::ifstream fs(filename, std::ios_base::binary); - if (!fs) { - LOG(INFO) << "missing parameters [" << filename << "] while loading model."; - if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) { - LOG(FATAL) << getName() << " missing, not allowed."; - return false; - } - if (kMissParameterRand == FLAGS_load_missing_parameter_strategy) { - LOG(INFO) << getName() << " missing, set to random."; 
- randomize(); - return true; - } - if (kMissParameterZero == FLAGS_load_missing_parameter_strategy) { - LOG(INFO) << getName() << " missing, set to zero."; - zeroMem(); - return true; - } - LOG(FATAL) << "unsupported load_missing_parameter_strategy: " - << FLAGS_load_missing_parameter_strategy; - return false; - } - return load(fs); -} - -bool Parameter::load(std::istream& s) { - CpuVector vec(*bufs_[PARAMETER_VALUE].get()); - Header header; - CHECK(s.read(reinterpret_cast(&header), sizeof(header))) - << "Fail to read parameter " << getName(); - CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: " - << header.format; - headerFormat_ = header.format; - CHECK_EQ(header.size, getSize()) - << "The size (" << header.size << ") in the file does not match the size " - << "(" << getSize() << ") of the parameter: " << getName(); - CHECK_EQ(header.valueSize, sizeof(real)) - << "Unsupported valueSize " << header.valueSize << " at: " << getName(); - CHECK(s.read(reinterpret_cast(vec.getData()), - header.size * sizeof(real))); - - auto& tmp = *bufs_[PARAMETER_VALUE].get(); - if (typeid(tmp) == typeid(GpuVector)) { - bufs_[PARAMETER_VALUE]->copyFrom(vec); - } - - if (config_.is_sparse() && config_.need_compact()) { - // load from dense parameter with many zero - CHECK_EQ(config_.dims_size(), 2); - auto height = config_.dims(0); - auto width = config_.dims(1); - auto mat = Matrix::create(vec.getData(), height, width); - CpuSparseMatrix sparseMat(height, - width, - 0, - FLOAT_VALUE, - format_, - /*trans*/ false); - sparseMat.copyFrom(*mat, HPPL_STREAM_DEFAULT); - auto nnz = sparseMat.getElementCnt(); - size_t rowSize = (format_ == SPARSE_CSR) ? height + 1 : nnz; - size_t colSize = (format_ == SPARSE_CSR) ? nnz : width + 1; - - intBufs_[PARAMETER_ROWS]->copyFrom(sparseMat.getRows(), rowSize); - intBufs_[PARAMETER_COLS]->copyFrom(sparseMat.getCols(), colSize); - bufs_[PARAMETER_VALUE]->resize(nnz); // for setMat check - bufs_[PARAMETER_VALUE]->copyFrom(sparseMat.getValue(), nnz); - config_.set_size(nnz); - LOG(INFO) << "compact nnz=" << (1. * nnz / (height * width)) - << " name=" << config_.name(); - } else if (config_.is_sparse()) { - CpuIVector rows(*intBufs_[PARAMETER_ROWS].get()); - CpuIVector cols(*intBufs_[PARAMETER_COLS].get()); - size_t rowSize, colSize; - CHECK_EQ(config_.dims_size(), 2); - if (format_ == SPARSE_CSR) { - rowSize = config_.dims(0) + 1; - colSize = config_.size(); - } else { - rowSize = config_.size(); - colSize = config_.dims(1) + 1; - } - CHECK( - s.read(reinterpret_cast(rows.getData()), rowSize * sizeof(int))); - CHECK( - s.read(reinterpret_cast(cols.getData()), colSize * sizeof(int))); - auto& paramRows = *intBufs_[PARAMETER_ROWS].get(); - if (typeid(paramRows) == typeid(GpuIVector)) { - intBufs_[PARAMETER_ROWS]->copyFrom(rows); - } - auto& paramCols = *intBufs_[PARAMETER_COLS].get(); - if (typeid(paramCols) == typeid(GpuIVector)) { - intBufs_[PARAMETER_COLS]->copyFrom(cols); - } - } - - setValueUpdated(); - - return true; -} - -} // namespace paddle diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h deleted file mode 100644 index ef519bf35a4f051b4477eb04b5eb2c5f0b5e29e8..0000000000000000000000000000000000000000 --- a/paddle/parameter/Parameter.h +++ /dev/null @@ -1,380 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include -#include -#include - -#include "ParameterConfig.pb.h" -#include "TrainerConfig.pb.h" - -#include "ParameterUpdaterHook.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/GlobalConstants.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/ThreadLocal.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -typedef enum { - /// The paddle original basic format - PARAM_FORMAT_ORIGINAL = 0, - - /// See mkldnn_memory_format_t in - /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h - /// for a detailed description. - /// 2D weights tensor in the format (output channels, input channels). - PARAM_FORMAT_MKLDNN_OI, - - /// The total number of format items - PARAM_FORMAT_ITEMS, -} PARAM_FORMAT; - -class SparsePrefetchRowCpuMatrix; - -class Parameter; -typedef std::function<void(Parameter* param)> UpdateCallback; -typedef std::function<void(int paramId, Parameter* para)> ParamInitCallback; - -class Parameter; -typedef std::shared_ptr<Parameter> ParameterPtr; - -class Parameter { - public: - Parameter(const ParameterConfig& config, bool useGpu, bool doInit = true); - const std::string& getName() const { return config_.name(); } - - size_t getSize() const { return config_.size(); } - - bool isFullSize() const { - if (bufs_[PARAMETER_VALUE]) { - return this->getSize() == bufs_[PARAMETER_VALUE]->getSize(); - } - return false; - } - - inline bool useGpu() const { return useGpu_; } - - int getDeviceId() const { return deviceId_; } - - void setDevice(int deviceId) { deviceId_ = deviceId; } - - /// The id ranges from 0 to the_total_number_of_parameters - 1 - size_t getID() const { return config_.para_id(); } - - /// The ID is an implicit value created when the neural network is built. - void setID(size_t id) { config_.set_para_id(id); } - - bool isStatic() const { return config_.is_static(); } - - enum MatType { - MAT_NORMAL, - /// both value and grad are shared - MAT_NORMAL_SHARED, - - /// Now used in BatchNorm in CPU mode - MAT_VALUE_SHARED, - - /// sparse matrix, which has a full-size parameter - MAT_SPARSE_ROW_IDS, - /// sparse matrix, parameter size scales with the sparsity rate.
- MAT_SPARSE_ROW_AUTO_GROW, - MAT_CACHE_ROW, - MAT_SPARSE_ROW, - - /// sparse matrix for prefetching parameters from the pserver - MAT_SPARSE_ROW_PREFETCH, - /// same as above, but the parameter has full size for saving it locally - MAT_SPARSE_ROW_PREFETCH_FULL_SIZE, - }; - - void enableSparseParameter() { - if (config_.is_sparse()) { - if (config_.format() == "csr") { - size_t height = config_.dims(0); - size_t nnz = config_.size(); - enableIntType(PARAMETER_ROWS, height + 1); - enableIntType(PARAMETER_COLS, nnz); - format_ = SPARSE_CSR; - } else { - size_t width = config_.dims(1); - size_t nnz = config_.size(); - enableIntType(PARAMETER_COLS, width + 1); - enableIntType(PARAMETER_ROWS, nnz); - format_ = SPARSE_CSC; - } - } - } - - /// allocate a buffer for the given type - void enableType(ParameterType type, MatType matType = MAT_NORMAL) { - if (bufs_[type] || mats_[type]) { - return; - } - SetDevice device(deviceId_); - if (config_.dims_size() == 2) { - if (matType == MAT_NORMAL || matType == MAT_NORMAL_SHARED || - matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || - matType == MAT_VALUE_SHARED || matType == MAT_SPARSE_ROW_IDS) { - bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); - bufs_[type]->zeroMem(); - } else { - CHECK(isGradSparseUpdate()); - } - if (config_.is_sparse() && type == PARAMETER_VALUE) { - enableSparseParameter(); - } - setMat(type, matType); - } else { - bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); - bufs_[type]->zeroMem(); - } - } - - void enableBufType(ParameterType type) { - if (bufs_[type]) return; - bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); - bufs_[type]->zeroMem(); - } - - void enableIntType(ParameterType type, size_t intStoreSize = 0) { - if (!intBufs_[type]) { - SetDevice device(deviceId_); - size_t size = intStoreSize ? intStoreSize : config_.size(); - intBufs_[type] = IVector::create(size, useGpu_); - intBufs_[type]->zeroMem(); - } - } - - void enableSharedType(ParameterType type, - VectorPtr vec, - MatrixPtr mat = nullptr) { - if (!bufs_[type] && !mats_[type]) { - bufs_[type] = vec; - mats_[type] = mat; - } - } - - /// for batchGradientMachine: blockNum is the number of partitions of the matrix.
- bool isGradShared(size_t* blockNum = NULL); - - bool isValueShared(); - - // for AsgdSparseGradientMachine & SgdSparseGradientMachine - // and MultiGradientMachine - bool isGradSparseUpdate() const; - - bool isSparseRemoteUpdate() const { - return config_.sparse_remote_update() && !useGpu(); - } - - const ParameterConfig& getConfig() const { return config_; } - - ParameterConfig& getConfig() { return config_; } - - bool hasType(ParameterType pType) const { - return bufs_[pType] || mats_[pType]; - } - - const VectorPtr& getBuf(ParameterType pType) const { - return this->bufs_[pType]; - } - - const VectorPtr* getBufs() const { return bufs_; } - - const MatrixPtr& getMat(ParameterType pType) const { return mats_[pType]; } - - void setValueUpdated() { updated_ = true; } - - void clearValueUpdated() { updated_ = false; } - - bool isValueUpdated() const { return updated_; } - - /** - * Save parameter value to a file - */ - bool save(const std::string& filename) const; - - /** - * Save parameter to ostream - */ - bool save(std::ostream& s) const; - - /** - * Load parameter value from a file - */ - bool load(const std::string& filename); - - /** - * Load parameter from istream - */ - bool load(std::istream& is); - - void incShared() { sharedCount_++; } - - /** - * After one of the parameter's gradients is merged, - * you should call this function to do some additional processing. - */ - void incUpdate(const UpdateCallback& callbacks = NULL); - - void clearGradient() { - auto& mat = getMat(PARAMETER_GRADIENT); - if (mat) { - // zeroMem will also clear rows for SparseRowCpuMatrix - mat->zeroMem(); - } else { - auto& gradBuf = getBuf(PARAMETER_GRADIENT); - if (gradBuf) gradBuf->zeroMem(); - } - } - - void initialize(); - - /** - * Initialize the value according to config_: initial_mean, - * initial_std and initial_strategy. - */ - void randomize(); - static void randomize(const VectorPtr& value, const ParameterConfig& config); - - /// Initialize the value to 0 - void zeroMem(); - - /// file header structure - struct Header { - int32_t format; // = PARAM_FORMAT - uint32_t valueSize; // = sizeof(real) - uint64_t size; // = getSize() - }; - - /** - * @brief Is the header format supported. - */ - static bool isHeaderFormatSupported(int32_t fmt) { - return fmt < PARAM_FORMAT_ITEMS; - } - - /** - * @brief Get the format in header. - */ - int getHeaderFormat() { return headerFormat_; } - - /** - * @brief Set the format in header. - */ - void setHeaderFormat(int32_t fmt) { - CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: " - << fmt; - headerFormat_ = fmt; - } - - /** - * @brief Parameter update hook. - * - * The parameter's update hook runs before ParameterUpdater::updateImpl. - * It could modify gradient/momentum/etc. here, such as dropping some - * gradients. - */ - void updateHook() { - for (auto& hook : updaterHooks_) { - hook->update(this); - } - } - - /** - * @brief Initialize all updater hooks. - * - * This method should be invoked in ParameterUpdater::init() only. - */ - void initHook() { - for (auto& hook : updaterHooks_) { - hook->init(this); - } - }
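Given the Header struct above and the save()/load() implementations earlier in this diff, the on-disk layout is a 16-byte header followed by the raw value blob. A minimal standalone reader, assuming only what the struct shows (ParamFileHeader is an illustrative name, not Paddle API):

```cpp
#include <cstdint>
#include <cstdio>
#include <fstream>

// Sketch of the on-disk layout implied by Parameter::save(): a Header
// (format, valueSize, size) followed by size raw values of valueSize bytes.
struct ParamFileHeader {
  int32_t format;      // one of the PARAM_FORMAT_* values
  uint32_t valueSize;  // sizeof(real): 4 for float builds, 8 for double
  uint64_t size;       // number of elements that follow
};

int main(int argc, char** argv) {
  if (argc < 2) return 1;
  std::ifstream fs(argv[1], std::ios_base::binary);
  ParamFileHeader header;
  if (!fs.read(reinterpret_cast<char*>(&header), sizeof(header))) return 1;
  std::printf("format=%d valueSize=%u elements=%llu\n", header.format,
              header.valueSize, (unsigned long long)header.size);
  // The value blob occupies header.size * header.valueSize bytes from here.
  return 0;
}
```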
- protected: - /** - * @brief create the matrix for matType. - * - * used by gradient machines which need to specify the matrix type, - * instead of creating it in weights.cpp. - * - * @note pType should be enabled already. - */ - void setMat(ParameterType pType, int matType); - - bool isUpdatable() { return (updateCounter_ == sharedCount_); } - - void clearUpdate() { updateCounter_ = 0; } - - protected: - ParameterConfig config_; - - bool useGpu_; - - int deviceId_; - - /** - * @brief bufs_ stores parameter value and gradient. - * - * Layers should use bufs_[PARAMETER_VALUE] to form the weight matrix for - * calculation and store gradients to bufs_[PARAMETER_GRADIENT]. - */ - VectorPtr bufs_[NUM_PARAMETER_TYPES]; - - /** - * @brief Weight matrix for bufs_. - * - * It's helpful when the parameter is shared by multiple layers. - * Callers should check: if a mat exists, do not create it again. - */ - MatrixPtr mats_[NUM_PARAMETER_TYPES]; - - /// Int vectors, used in some user-defined parameter types - IVectorPtr intBufs_[NUM_PARAMETER_TYPES]; - - int sharedCount_; - int updateCounter_; - - bool updated_; - SparseFormat format_; - - /// The header format for saving or loading param - int32_t headerFormat_; - - std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_; - - public: - void setSharedCount(int cnt) { sharedCount_ = cnt; } - int getSharedCount() { return sharedCount_; } - - bool isSparse() { return config_.is_sparse(); } - SparseFormat getFormat() { return format_; } - - static const std::string kMissParameterFail; - static const std::string kMissParameterRand; - static const std::string kMissParameterZero; -}; - -typedef std::map<std::string, ParameterPtr> ParameterMap; - -} // namespace paddle diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h deleted file mode 100644 index 7434baa2d3d6297cc6d8d99b46cff516d6ac49f9..0000000000000000000000000000000000000000 --- a/paddle/parameter/ParameterUpdateFunctions.h +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/math/Vector.h" -#include "paddle/utils/Common.h" - -namespace paddle { - -/** - * Performs the following operations. - * - * momentumVec = momentum * momentumVec - * - learningRate * grad - * - learningRate * decayRate * value - * - * value = value + momentumVec - * momentum = 0 or decayRate = 0 are specially handled to avoid unnecessary - * computation. - */ -void sgdUpdate(real learningRate, - real momentum, - real decayRate, - Vector* value, - Vector* grad, - Vector* momentumVec); - -void sgdUpdateCpu(real learningRate, - real momentum, - real decayRate, - size_t size, - real* value, - const real* grad, - real* momentumVec); - -void sgdUpdateAvx(float learningRate, - float momentum, - float decayRate, - size_t size, - float* value, - const float* grad, - float* momentumVec); - -} // namespace paddle
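A scalar reference of the sgdUpdate() formula documented above may help; sgdUpdateRef is a hypothetical name, and the real sgdUpdateCpu/sgdUpdateAvx kernels vectorize this same loop:

```cpp
#include <cstddef>
#include <cstdio>

// Scalar reference of the documented semantics: decay the momentum, subtract
// the gradient and weight-decay terms, then apply the momentum to the value.
void sgdUpdateRef(float learningRate, float momentum, float decayRate,
                  std::size_t size, float* value, const float* grad,
                  float* momentumVec) {
  for (std::size_t i = 0; i < size; ++i) {
    momentumVec[i] = momentum * momentumVec[i]
                     - learningRate * grad[i]
                     - learningRate * decayRate * value[i];
    value[i] += momentumVec[i];
  }
}

int main() {
  float v[2] = {2.0f, 2.0f}, g[2] = {1.0f, 1.0f}, m[2] = {0.0f, 0.0f};
  sgdUpdateRef(0.1f, 0.9f, 0.01f, 2, v, g, m);
  std::printf("%f %f\n", v[0], m[0]);  // 1.898 and -0.102
}
```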
diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp deleted file mode 100644 index 989185b66a5b7785bb0572fba59a72adeef9797b..0000000000000000000000000000000000000000 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ParameterUpdaterHook.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/math/Vector.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -/** - * The static pruning hook. - * Static means the user specifies a sparsity_ratio before training starts, and - * the network will prune the parameters based on that sparsity_ratio. More - * details can be found at https://arxiv.org/pdf/1506.02626.pdf. - */ - -class StaticPruningHook : public IParameterUpdaterHook { - public: - explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig) - : initCount_(0) { - sparsityRatio_ = hookConfig.sparsity_ratio(); - } - - // Note: despite its name, this comparator orders pairs by the first element - // in descending order, so partial_sort keeps the largest magnitudes first. - static bool sortPairAscend(const std::pair<real, size_t> &pair1, - const std::pair<real, size_t> &pair2) { - return pair1.first > pair2.first; - } - - void update(Parameter *para) { - updateThreadChecker_.check(); - auto &vec = para->getBuf(PARAMETER_GRADIENT); - if (vec) { - vec->dotMul(*maskVec_); - } - } - - void generateMask(Parameter *para) { - VectorPtr maskTemp = Vector::create(para->getSize(), false); - maskTemp->zeroMem(); - real *maskTempData = maskTemp->getData(); - size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_); - - VectorPtr paraVec = para->getBuf(PARAMETER_VALUE); - VectorPtr paraCpuCopy = Vector::create(para->getSize(), false); - - paraCpuCopy->copyFrom(*paraVec); - std::vector<std::pair<real, size_t>> param; - - for (size_t i = 0; i < para->getSize(); i++) - param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i)); - - std::partial_sort( - param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend); - for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0; - - // Currently just use a mask vector as a stopgap. - if (para->useGpu()) { - maskVec_ = Vector::create(para->getSize(), para->useGpu()); - maskVec_->copyFrom(*maskTemp); - } else { - maskVec_ = maskTemp; - } - } - - void init(Parameter *para) { - generateMask(para); - size_t initCount = this->initCount_.fetch_add(1); - CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must be " - "invoked in the same ParameterUpdater"; - VLOG(3) << "Initialize Parameter " << para; - SetDevice device(para->getDeviceId()); - - auto &paraVec = para->getBuf(PARAMETER_VALUE); - paraVec->dotMul(*maskVec_); - } - - private: - SameThreadChecker updateThreadChecker_; - std::atomic<size_t> initCount_; - VectorPtr maskVec_; - real sparsityRatio_; -}; - -IParameterUpdaterHook::IParameterUpdaterHook() {} - -IParameterUpdaterHook::~IParameterUpdaterHook() {}
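To make the pruning policy concrete: generateMask() keeps the (1 - sparsity_ratio) fraction of weights with the largest absolute value and zeroes the rest. A self-contained sketch mirroring the partial_sort logic above; pruneMask is an illustrative name, not Paddle API:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Keep the (1 - ratio) fraction of weights with the largest |value|.
std::vector<float> pruneMask(const std::vector<float>& w, float sparsityRatio) {
  size_t keep = static_cast<size_t>(w.size() * (1.0f - sparsityRatio));
  std::vector<std::pair<float, size_t>> byMagnitude;
  for (size_t i = 0; i < w.size(); ++i)
    byMagnitude.push_back({std::fabs(w[i]), i});
  // Largest magnitudes first, as in generateMask() above.
  std::partial_sort(byMagnitude.begin(), byMagnitude.begin() + keep,
                    byMagnitude.end(),
                    [](auto& a, auto& b) { return a.first > b.first; });
  std::vector<float> mask(w.size(), 0.0f);
  for (size_t i = 0; i < keep; ++i) mask[byMagnitude[i].second] = 1.0f;
  return mask;
}

int main() {
  auto mask = pruneMask({0.9f, -0.05f, 0.4f, -0.8f}, 0.5f);  // keep 2 of 4
  for (float m : mask) std::printf("%.0f ", m);  // prints: 1 0 0 1
}
```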
- -/** - * A hasher used by g_hookCache_. - * - * An independent hasher is used intentionally: PServer has a hasher for - * hashing ParameterBlock, but we do not reuse it, to reduce dependencies. - * - * May be extracted to Util.h to unify the hashers. - */ -class StringIntPairHasher { - public: - size_t operator()(const std::pair<std::string, int> &k) const { - return intHasher_(strHasher_(k.first) + k.second); - } - - private: - std::hash<std::string> strHasher_; - std::hash<size_t> intHasher_; -}; - -static WeakKVCache<std::pair<std::string, int>, - IParameterUpdaterHook, - StringIntPairHasher> - g_hookCache_; - -/** - * The actual factory method for ParameterUpdaterHook. - */ -static IParameterUpdaterHook *createImpl( - const ParameterUpdaterHookConfig &config) { - auto &type = config.type(); - if (type == "pruning") { - return new StaticPruningHook(config); - } - - LOG(FATAL) << "Unknown Hook type: " << type; - return nullptr; -} - -std::shared_ptr<IParameterUpdaterHook> IParameterUpdaterHook::create( - const ParameterConfig &paramConfig, int idx) { - std::pair<std::string, int> key = {paramConfig.name(), idx}; - return g_hookCache_.get( - key, [&] { return createImpl(paramConfig.update_hooks(idx)); }); -} - -} // namespace paddle diff --git a/paddle/parameter/ThreadLocalBuffer.h b/paddle/parameter/ThreadLocalBuffer.h deleted file mode 100644 index 07c96e59d0bc0a58ce9956a54e7de359896e5618..0000000000000000000000000000000000000000 --- a/paddle/parameter/ThreadLocalBuffer.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/math/Vector.h" - -namespace paddle { -namespace parameter { -extern VectorPtr* getThreadLocalBuffer(); -} // namespace parameter -} // namespace paddle diff --git a/paddle/parameter/Weight.h b/paddle/parameter/Weight.h deleted file mode 100644 index 113dd6530c82fe1e831ad4a35e9cbcb9880b9243..0000000000000000000000000000000000000000 --- a/paddle/parameter/Weight.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once -#include -#include - -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseRowMatrix.h" -#include "paddle/parameter/Parameter.h" - -namespace paddle { - -class Weight { - private: - MatrixPtr weight_; - MatrixPtr weightGrad_; - ParameterPtr parameter_; - - public: - Weight(size_t height, size_t width, ParameterPtr parameter); - Weight(size_t height, size_t width, ParameterPtr parameter, size_t offset); - - const MatrixPtr& getW() { return weight_; } - const MatrixPtr& getWGrad() { return weightGrad_; } - const ParameterPtr& getParameterPtr(); - - void incUpdate(const UpdateCallback& callback) { - getParameterPtr()->incUpdate(callback); - } - - void setParameterPtr(ParameterPtr param); -}; - -typedef std::vector> WeightList; - -} // namespace paddle diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp deleted file mode 100644 index 54ceb3e08714e37abb5d491c8973bee631b993be..0000000000000000000000000000000000000000 --- a/paddle/parameter/tests/test_argument.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -using namespace paddle; // NOLINT - -TEST(Argument, poolSequenceWithStride) { - Argument input, output; - ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false); - int* inStart = input.sequenceStartPositions->getMutableData(false); - inStart[0] = 0; - inStart[1] = 9; - inStart[2] = 14; - inStart[3] = 17; - inStart[4] = 30; - - int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30}; - int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30}; - - for (auto reversed : {false, true}) { - ICpuGpuVectorPtr stridePositions; - output.poolSequenceWithStride( - input, 5 /* stride */, &stridePositions, reversed); - - const int* outStart = output.sequenceStartPositions->getData(false); - CHECK_EQ(outStart[0], 0); - CHECK_EQ(outStart[1], 2); - CHECK_EQ(outStart[2], 3); - CHECK_EQ(outStart[3], 4); - CHECK_EQ(outStart[4], 7); - - CHECK_EQ(stridePositions->getSize(), 8UL); - auto result = reversed ? strideResultReversed : strideResult; - for (int i = 0; i < 8; i++) { - CHECK_EQ(stridePositions->getData(false)[i], result[i]); - } - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp deleted file mode 100644 index 89dcc6c751eb2ec07bfe8297c93d56c824086211..0000000000000000000000000000000000000000 --- a/paddle/parameter/tests/test_common.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include -#include -#include -#include -#include - -using namespace paddle; // NOLINT - -class CommonTest : public ::testing::Test { - protected: - CommonTest() : testStat_("test") {} - virtual ~CommonTest() {} - virtual void SetUp() { - const size_t buffSize[] = { - 100, 128, 500, 1024, 4096, 10240, 102400, 1000000}; - sizeVec_.resize(8); - memcpy(&sizeVec_[0], &buffSize[0], 8 * sizeof(size_t)); - valueUint_.resize(4); - valueUint_[0].first = 0.0; - valueUint_[0].second = 0.0; - valueUint_[1].first = 0.0; - valueUint_[1].second = 1.0; - valueUint_[2].first = 1.0; - valueUint_[2].second = 0.0; - valueUint_[3].first = 1.0; - valueUint_[3].second = 1.0; - learningRate_ = 1.0; - } - - void test_sgdUpadate(real* gradientBuffer, - real* valueBuffer, - real* momentumBuffer, - size_t size); - - virtual void TreaDown() { LOG(INFO) << "All Test Finished."; } - - protected: - std::vector> valueUint_; - std::vector sizeVec_; - real learningRate_; - StatSet testStat_; -}; - -void CommonTest::test_sgdUpadate(real* gradientBuffer, - real* valueBuffer, - real* momentumBuffer, - size_t size) { -// sgdUpdateAvx has no double version yet -#if defined(__AVX__) && !defined(PADDLE_TYPE_DOUBLE) - real valueSum1 = 0, valueSum2 = 0, momSum1 = 0, momSum2 = 0; - real* gradTmp = new real[size]; - real* valueTmp = new real[size]; - real* momentumTmp = new real[size]; - memcpy(gradTmp, gradientBuffer, size * sizeof(real)); - memcpy(valueTmp, valueBuffer, size * sizeof(real)); - memcpy(momentumTmp, momentumBuffer, size * sizeof(real)); - for (auto& arg : valueUint_) { - { - { - struct timeval t; - REGISTER_TIMER("gettimeofday", 0, testStat_); - gettimeofday(&t, NULL); - } - REGISTER_TIMER("avxTimer", 0); - sgdUpdateAvx(learningRate_, - arg.first, - arg.second, - size, - valueBuffer, - gradientBuffer, - momentumBuffer); - } - for (size_t i = 0; i < size; i++) { - valueSum1 += valueBuffer[i]; - momSum1 += momentumBuffer[i]; - // std::cout << "[" - // << valueBuffer[i] - // << "," << momentumBuffer[i] - // << "," << gradientBuffer[i] << "],"; - } - { - REGISTER_TIMER("cpuTimer", 0); - sgdUpdateCpu(learningRate_, - arg.first, - arg.second, - size, - valueTmp, - gradTmp, - momentumTmp); - } - for (size_t i = 0; i < size; i++) { - valueSum2 += valueTmp[i]; - momSum2 += momentumTmp[i]; - // std::cout << "[" - // << valueTmp[i] - // << "," << momentumTmp[i] - // << "," << gradTmp[i] << "],"; - } - - VLOG(3) << "valueSum1 = " << valueSum1 << " ; valueSum2 = " << valueSum2; - VLOG(3) << "momSum1 = " << momSum1 << " ; momSum2 = " << momSum2; - ASSERT_EQ(valueSum1, valueSum2); - ASSERT_EQ(momSum1, momSum2); - } - delete[] gradTmp; - delete[] valueTmp; - delete[] momentumTmp; -#endif -} - -TEST_F(CommonTest, sgdUpdate) { - const size_t alignHeader[] = {0, 2, 3, 5, 7, 8}; - for (auto& size : sizeVec_) { - real *gradientBuffer, *valueBuffer, *momentumBuffer; - CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size), - 0); - CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0); - CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size), - 0); - - for (size_t 
i = 0; i < size; i++) { - gradientBuffer[i] = 1.0; - valueBuffer[i] = 2.0; - momentumBuffer[i] = 3.0; - } - for (int i = 0; i < 6; i++) { - LOG(INFO) << "----------------------" << size << ":" << alignHeader[i] - << "-------------------------"; - test_sgdUpadate(&gradientBuffer[alignHeader[i]], - &valueBuffer[alignHeader[i]], - &momentumBuffer[alignHeader[i]], - size - alignHeader[i]); - } - free(gradientBuffer); - free(valueBuffer); - free(momentumBuffer); - } - globalStat.printAllStatus(); - testStat_.printAllStatus(); -} - -TEST_F(CommonTest, syncThreadPool) { - SyncThreadPool pool(10); - - std::vector<int> nums; - nums.resize(10); - - pool.exec([&](int tid, size_t numThreads) { nums[tid] = tid; }); - for (size_t i = 0; i < nums.size(); ++i) { - EXPECT_EQ((int)i, nums[i]); - } - - pool.exec([&](int tid, size_t numThreads) { nums[tid] -= tid; }); - for (size_t i = 0; i < nums.size(); ++i) { - EXPECT_EQ((int)0, nums[i]); - } -} diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h deleted file mode 100644 index d50230e73a3a7d128cbfd1d70517fddd228fb1bb..0000000000000000000000000000000000000000 --- a/paddle/pserver/BaseClient.h +++ /dev/null @@ -1,311 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ParameterService.pb.h" -#include "paddle/math/Matrix.h" -#include "paddle/pserver/ProtoServer.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Queue.h" - -namespace paddle { - -/** - * It manages all connections to pservers. - * There are two modes for managing connections to all pservers. First, each - * connection owns two threads that separately manage sending and receiving - * data. Second, each thread uses one connection for all activity in it. - * The first solution is armed with sendThreads_/recvThreads_ and - * sendJobQueue_/recvJobQueue_. The second solution uses a shared thread pool - * to manage connections. - */
-class BaseClient { - protected: - typedef std::unique_ptr<std::thread> ThreadPtr; - typedef std::vector<std::vector<iovec>> InputIovs; - typedef std::vector<SendParameterRequest> SendRequest; - typedef std::vector<SendDataRequest> SendDataRequestVec; - - // TODO(yanfei): - // refine the data structure to unify parameter and feature communication - struct SendJob { - /// store parameter-related block data - InputIovs parallelInputIovs; - /// store protobuf requests - SendRequest parallelRequests; - /// store data, such as features for metric learning - SendDataRequestVec parallelDataRequests; - }; - - public: - explicit BaseClient(bool separate = false, int numPorts = FLAGS_ports_num); - - virtual ~BaseClient(); - - typedef std::shared_ptr<SendJob> SendJobPtr; - typedef Queue<SendJobPtr> SendQueue; - - /// send data to the server; only synchronous operation is supported - template <class DataType> - void putData(int clientId, - SendDataType type, - DataType* datas, - size_t size, - DataUpdateMode mode) { - synchronize(SYNC_DATA); - sendData(clientId, type, mode, datas, size); - recvData(); - synchronize(SYNC_DATA); - } - - template <class DataType> - void putOwnData(int clientId, - SendDataType type, - DataType* datas, - size_t size) { - putData(clientId, type, datas, size, DATA_UPDATE_MODE_SET_OWN); - } - - template <class DataType> - void getAllData(int clientId, - SendDataType type, - DataType* datas, - size_t size) { - sendData(clientId, - type, - DATA_UPDATE_MODE_GET_ALL, - reinterpret_cast<DataType*>(NULL), - 0); - recvData(); - size_t dataOffset = 0; - for (auto& recvMem : recvDataMems_) { - CHECK_LE(dataOffset, size); - size_t memSize = std::min(recvMem.get()->getSize(), - sizeof(DataType) * (size - dataOffset)); - CHECK_EQ(memSize % sizeof(DataType), size_t(0)); - memcpy(datas + dataOffset, recvMem.get()->getBuf(), memSize); - dataOffset += memSize / sizeof(DataType); - } - CHECK_EQ(dataOffset, size); - } - - /** - * Reduces values on all clients. - * This reduce only supports SUM. - * The result is saved in the recvBuf of the rootId client. - */ - template <class DataType> - void reduce(DataType* sendBuf, - DataType* recvBuf, - size_t size, - int clientId, - int rootId) { - putOwnData(clientId, DATA_REDUCE_SUM, sendBuf, size); - if (rootId == clientId) { - getAllData(clientId, DATA_REDUCE_SUM, recvBuf, size); - } - } - - /** - * return the trans data type according to the input type - */ - virtual TransDataType getTransDtype(const std::type_info& info) { - TransDataType dataType; - if (typeid(int*) == info) { // NOLINT - dataType = TRANS_INT32; - } else if (typeid(uint32_t*) == info) { // NOLINT - dataType = TRANS_UINT32_T; - } else if (typeid(int64_t*) == info) { // NOLINT - dataType = TRANS_INT64_T; - } else if (typeid(uint64_t*) == info) { // NOLINT - dataType = TRANS_UINT64_T; - } else if (typeid(float*) == info) { // NOLINT - dataType = TRANS_FLOAT; - } else if (typeid(double*) == info) { // NOLINT - dataType = TRANS_DOUBLE; - } else { - LOG(FATAL) << "not supported"; - } - return dataType; - }
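The reduce() contract above is a sum-reduction whose result only the rootId client reads back. The toy below simulates that data flow in a single process; it does not use the pserver API, and all names are illustrative:

```cpp
#include <cstdio>
#include <vector>

// Every client contributes its sendBuf, the pserver sums them elementwise,
// and only the rootId client fetches the summed result into its recvBuf.
int main() {
  std::vector<std::vector<float>> sendBufs = {{1, 2}, {10, 20}, {100, 200}};
  std::vector<float> serverSum(2, 0.0f);
  for (auto& buf : sendBufs)  // putOwnData from each client
    for (size_t i = 0; i < buf.size(); ++i) serverSum[i] += buf[i];
  int rootId = 0;             // only the root performs the getAllData step
  std::vector<float> recvBuf = serverSum;
  std::printf("root %d sees: %g %g\n", rootId, recvBuf[0], recvBuf[1]);
}
```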
- - protected: - /// for a > 0, b > 0: - /// return the smallest x s.t. b*x >= a - static int divup(int a, int b) { return (a + b - 1) / b; } - - int calcClientId(int i, int serviceNum) { - return (i + FLAGS_trainer_id * numPorts_) % serviceNum; - } - - /// start threads in sendThreads_ and recvThreads_ - void startThreads(); - - /// finish threads in sendThreads_ and recvThreads_ - void finishThreads(); - - template <class DataType> - void prepareData(int clientId, - SendDataType type, - DataUpdateMode updateMode, - DataType* datas, - size_t size, - SendJob* sendJob) { - sendJob->parallelDataRequests.resize(serviceNum_); - sendJob->parallelInputIovs.resize(serviceNum_); - for (int i = 0; i < serviceNum_; ++i) { - auto& request = sendJob->parallelDataRequests[i]; - request.set_update_mode(updateMode); - request.set_type(type); - request.set_client_id(clientId); - request.set_server_id(i); - } - - /// split the datas which need to be sent to the server into serviceNum_ pieces - if (!datas) { - CHECK(!size) << "ownSize should be zero since datas is nullptr"; - } - size_t baseSize = size / serviceNum_; - size_t dataOffset = 0; - for (int i = 0; i < serviceNum_; ++i) { - auto& request = sendJob->parallelDataRequests[i]; - DataBlock* block = request.add_blocks(); - size_t ownSize = size_t(i) < size % serviceNum_ ? baseSize + 1 : baseSize; - size_t realSize = datas ? std::max(ownSize, size_t(1)) : 0; - block->set_total_size(realSize * sizeof(DataType)); - block->set_data_size(sizeof(DataType)); - // TODO(yuyang18): getTransDtype can be rewritten as a template method - // to reduce runtime overhead. - block->set_data_type(getTransDtype(typeid(DataType*))); // NOLINT - if (datas) { - sendJob->parallelInputIovs[i].push_back( - {datas + dataOffset, realSize * sizeof(DataType)}); - } - dataOffset += ownSize; - } - CHECK_EQ(dataOffset, size); - } - - /** - * @brief send data to all data servers - * - * @note each trainer sends all its data to all data servers. - * it's for broadcast data synchronization, such as feature - * synchronization in metric learning. - */ - template <class DataType> - void sendData(int clientId, - SendDataType type, - DataUpdateMode updateMode, - DataType* datas, - size_t size) { - SendJobPtr sendJob = std::make_shared<SendJob>(); - prepareData(clientId, type, updateMode, datas, size, sendJob.get()); - for (int i = 0; i < threadNum_; ++i) { - sendJobQueue_[i]->enqueue(sendJob); - } - } - - /** - * @brief recv data from all data servers - * - * @note synchronizes all recv threads - */ - void recvData(); - - /// send requests, and recv responses - template <class ProtoIn, class ProtoOut> - void multiCall(const char* funcName, - const ProtoIn& request, - std::vector<ProtoOut>* responses) { - responses->resize(clients_.size()); - size_t numClients = clients_.size(); - for (size_t i = 0; i < numClients; ++i) { - clients_[i].send(funcName, request); - } - for (size_t i = 0; i < numClients; ++i) { - clients_[i].recv(&(*responses)[i]); - } - } - - /** - * @brief synchronize all trainers and pservers - * - * @note used to ensure that the data of all trainers has been received - */ - void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
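The splitting rule inside prepareData() gives the first size % serviceNum_ servers one extra element. A standalone arithmetic check of that rule, with made-up sizes:

```cpp
#include <cstdio>

// size elements are divided over serviceNum pservers; the first
// (size % serviceNum) servers get one extra element each.
int main() {
  size_t size = 10, serviceNum = 4;
  size_t baseSize = size / serviceNum, dataOffset = 0;
  for (size_t i = 0; i < serviceNum; ++i) {
    size_t ownSize = i < size % serviceNum ? baseSize + 1 : baseSize;
    std::printf("server %zu: offset=%zu count=%zu\n", i, dataOffset, ownSize);
    dataOffset += ownSize;  // 3 + 3 + 2 + 2 == 10
  }
}
```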
-  /**
-   * @brief use multiple threads to send data separately
-   *
-   * @note each thread reads its own JobQueue to handle requests, and uses
-   *       calcClientId() to retrieve the connections it manages;
-   *       send and recv are implemented in the child class.
-   */
-  virtual void send(int threadId) = 0;
-
-  /**
-   * @brief use multiple threads to receive data separately
-   *
-   * @note almost the same as send()
-   */
-  virtual void recv(int threadId) = 0;
-
- protected:
-  bool stopping_;
-  /// nodes * ports, i.e. the number of real pservers
-  int serviceNum_;
-  /**
-   * number of threads for managing all services. Normally the
-   * number of pservers is relatively small (several hundred at most),
-   * so thread-based parallelization can benefit traffic performance
-   * as well as the pservers' sgd optimization performance.
-   */
-  int threadNum_;
-  /// the connection managers at the client end
-  std::vector<ProtoClient> clients_;
-  /// send threads for parallelization
-  std::vector<ThreadPtr> sendThreads_;
-  /// recv threads for parallelization
-  std::vector<ThreadPtr> recvThreads_;
-  std::unique_ptr<ThreadBarrier> recvSyncBarrier_;
-
-  // TODO(yanfei):
-  // currently the pservers only return values after all parameters'
-  // optimization has finished, so in practice the recvs do not overlap.
-  // A more robust implementation would pipeline all send/recv actions
-  // at the parameter-unit level; that would benefit deep and large model
-  // training in the future, especially when local node computation power
-  // surpasses the inter-connection bandwidth, as in GPU clusters, even
-  // BOX GPU clusters.
-  // queue for buffering send requests
-  /**
-   * the send/recv queues cooperate with each other to accomplish
-   * overlapping communication with the forwardBackward action.
-   */
-  std::vector<std::unique_ptr<SendQueue>> sendJobQueue_;
-  /// queue for buffering recv requests
-  std::vector<std::unique_ptr<SendQueue>> recvJobQueue_;
-  /// specific for dserver
-  SendJob sendJob_;
-  /// port num for each node
-  int numPorts_;
-  /// if set, overlapped optimization is disabled
-  bool separateSendAndRecv_;
-  std::vector<CpuMemHandlePtr> recvDataMems_;
-};
-}  // namespace paddle
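Before the deleted implementation file, it is worth illustrating the per-thread job-queue pattern that the comments above describe: the trainer enqueues one shared SendJob per connection thread, overlapping communication with the next forward/backward pass. A minimal sketch under assumed names (SimpleQueue is a stand-in for Paddle's Queue class):

    #include <condition_variable>
    #include <cstdio>
    #include <memory>
    #include <mutex>
    #include <queue>
    #include <thread>
    #include <utility>

    template <class T>
    class SimpleQueue {
     public:
      void enqueue(T v) {
        {
          std::lock_guard<std::mutex> g(m_);
          q_.push(std::move(v));
        }
        cv_.notify_one();
      }
      T dequeue() {
        std::unique_lock<std::mutex> l(m_);
        cv_.wait(l, [this] { return !q_.empty(); });
        T v = std::move(q_.front());
        q_.pop();
        return v;
      }

     private:
      std::mutex m_;
      std::condition_variable cv_;
      std::queue<T> q_;
    };

    int main() {
      SimpleQueue<std::shared_ptr<int>> sendJobQueue;
      // plays the role of a send(threadId) routine draining its own queue
      std::thread sender([&sendJobQueue] {
        std::shared_ptr<int> job = sendJobQueue.dequeue();
        std::printf("sent job %d\n", *job);
      });
      sendJobQueue.enqueue(std::make_shared<int>(42));  // trainer enqueues work
      sender.join();
      return 0;
    }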
diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp
deleted file mode 100644
index 43e4902b0f0f73840624041f19ba7f4eb9a45844..0000000000000000000000000000000000000000
--- a/paddle/pserver/ParameterClient2.cpp
+++ /dev/null
@@ -1,781 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include
-
-#include "ParameterClient2.h"
-#include "paddle/math/SparseRowMatrix.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/StringUtil.h"
-
-DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
-DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
-
-namespace paddle {
-
-template <typename T1, typename T2>
-void copyToRepeatedField(google::protobuf::RepeatedField<T1>* dest,
-                         const T2* src,
-                         size_t size) {
-  dest->Clear();
-  dest->Reserve(size);
-  for (size_t i = 0; i < size; ++i) {
-    dest->AddAlreadyReserved(src[i]);
-  }
-}
-
-ParameterClient2::ParameterClient2(bool separate, int port, int numPorts)
-    : BaseClient(separate, numPorts), port_(port) {
-#ifndef PADDLE_DISABLE_TIMER
-  forwardbackwordTime_ = 0;
-#endif
-}
-
-int ParameterClient2::calcParameterBlockSize(
-    const std::vector<ParameterPtr>& parameters, size_t serviceNum) {
-  size_t totalSize = 0;
-  for (auto& para : parameters) {
-    totalSize += para->getSize();
-  }
-  size_t perServerSize = totalSize / serviceNum;
-
-  int sizeBits = 64 - __builtin_clzl(perServerSize);
-
-  /// 2^10 is the min block size
-  /// 2^7 will be the max number of blocks in one pserver
-  int blockSizeBits = std::max((sizeBits - 7), 10);
-  return 1 << blockSizeBits;
-}
-
-void ParameterClient2::initThreads() {
-  threadNum_ = serviceNum_;
-  if (FLAGS_parallel_thread_num > 1) {
-    LOG(INFO) << "parallel_thread_num does not need to be set";
-  }
-  syncThreadPool_.reset(new SyncThreadPool(threadNum_));
-  startThreads();
-}
-
-bool ParameterClient2::init(const std::vector<ParameterPtr>& parameters) {
-  destroy();
-
-  std::vector<std::string> hosts;
-  str::split(FLAGS_pservers, ',', &hosts);
-  serviceNum_ = hosts.size() * numPorts_;
-  uint64_t denseBlockSize = calcParameterBlockSize(parameters, serviceNum_);
-
-  /// setup the prefetch matrix if it exists
-  for (auto& para : parameters) {
-    /// set the block size for each parameter
-    para->getConfig().set_parameter_block_size(
-        para->getConfig().sparse_remote_update() ? para->getConfig().dims(1)
-                                                 : denseBlockSize);
-  }
-
-  for (auto& para : parameters) {
-    CHECK_NE(-1UL, para->getID()) << "id in parameter is not initialized";
-    parameterMap_[para->getID()] = para;
-  }
-
-  allSegments_.reserve(parameters.size());
-
-  for (auto& para : parameters) {
-    ParameterSegments segments;
-    segments.name = para->getName();
-    segments.id = para->getID();
-    allSegments_.push_back(segments);
-    if (para->getConfig().sparse_remote_update()) {
-      CHECK_EQ(para->getConfig().parameter_block_size(),
-               para->getConfig().dims(1))
-          << "For a sparse remote update parameter,"
-          << " the block size is the width of each row.";
-    }
-  }
-
-  /// init clients
-  clients_.reserve(serviceNum_);
-  recvDataMems_.resize(serviceNum_);
-
-  for (size_t i = 0; i < hosts.size(); ++i) {
-    for (int j = 0; j < numPorts_; ++j) {
-      LOG(INFO) << "pserver " << i * numPorts_ + j << " " << hosts[i] << ":"
-                << port_ + j;
-      if (FLAGS_rdma_tcp == "rdma") {
-        clients_.emplace_back(hosts[i], port_ + j, F_RDMA);
-      } else {
-        clients_.emplace_back(hosts[i], port_ + j, F_TCP);
-      }
-    }
-  }
-
-  sparseDistribution_.reset(new SparseParameterDistribution(serviceNum_));
-
-  sleep(2);
-
-  initThreads();
-
-  return true;
-}
-
-ParameterClient2::~ParameterClient2() { destroy(); }
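A worked example of calcParameterBlockSize() above, under assumed sizes: with 10M total parameters over 4 services, perServerSize = 2.5M, the highest set bit gives sizeBits = 22, and blockSizeBits = max(22 - 7, 10) = 15, i.e. blocks of 32768 values (at most about 2^7 blocks per pserver, never below the 2^10 minimum):

    #include <algorithm>
    #include <cstdio>

    int main() {
      unsigned long perServerSize = 10000000ul / 4;       // assumed workload
      int sizeBits = 64 - __builtin_clzl(perServerSize);  // GCC/Clang builtin,
                                                          // LP64 assumed
      int blockSizeBits = std::max(sizeBits - 7, 10);
      std::printf("block size = %d\n", 1 << blockSizeBits);  // 32768
      return 0;
    }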
-
-void ParameterClient2::destroy() {
-  if (clients_.empty()) {
-    /// this means the client is not initialized
-    return;
-  }
-  finishThreads();
-
-  parameterMap_.clear();
-  allSegments_.clear();
-  clients_.clear();
-}
-
-void ParameterClient2::sendParallel(int tid,
-                                    size_t numThreads,
-                                    ParameterType recvParameterType) {
-  int numMyClients = divup(serviceNum_ - tid, numThreads);
-
-  for (int j = 0; j < numMyClients; ++j) {
-    REGISTER_TIMER("client_sendAndRecv_send");
-    int i = numThreads * j + tid;
-    /// Try to make different clients send data to different pservers
-    /// at the same time so that they will not flood data to the same
-    /// pserver.
-    i = calcClientId(i, serviceNum_);
-    clients_[i].send("sendParameter",
-                     sendJob_.parallelRequests[i],
-                     sendJob_.parallelInputIovs[i]);
-
-    /// clear the large structures
-    sendJob_.parallelRequests[i].Clear();
-    sendJob_.parallelInputIovs[i].clear();
-  }
-
-  std::vector<void*> bufs;
-  SendParameterResponse response;
-  for (int j = 0; j < numMyClients; ++j) {
-    REGISTER_TIMER("client_sendAndRecv_recv");
-    int i = numThreads * j + tid;
-    i = calcClientId(i, serviceNum_);
-    auto msgReader = clients_[i].recv(&response);
-    CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size());
-    bufs.clear();
-    bufs.reserve(response.blocks_size());
-    for (auto& block : response.blocks()) {
-      auto it = parameterMap_.find(block.para_id());
-      CHECK(it != parameterMap_.end());
-      Parameter* parameter = it->second.get();
-      real* buf = nullptr;
-      if (parameter->getBuf(recvParameterType)) {
-        buf = parameter->getBuf(recvParameterType)->getPoint(block.begin_pos());
-      } else {
-        auto recvMat = dynamic_cast<SparseRowCpuMatrix*>(
-            parameter->getMat(recvParameterType).get());
-        CHECK(recvMat);
-        size_t width = parameter->getConfig().dims(1);
-        // TODO(wuyi): is a lock needed here? This may also cause a resize.
-        buf = recvMat->getLocalRow(block.begin_pos() / width);
-      }
-      /// sparse_id is not useful while receiving data since sparse data
-      /// storage is continuous; commit received data the same way as dense.
-      bufs.push_back(buf);
-    }
-    msgReader->readBlocks(bufs);
-  }
-}
-
-void ParameterClient2::prepareSendData(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    ParameterType sendBackParameterType,
-    BatchStatus batchStatus,
-    SendJob* sendJob) {
-  sendJob->parallelRequests.resize(serviceNum_);
-  sendJob->parallelInputIovs.resize(serviceNum_);
-
-  for (auto& request : sendJob->parallelRequests) {
-#ifndef PADDLE_DISABLE_TIMER
-    if (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT) {
-      request.set_forwardbackward_time(forwardbackwordTime_);
-    }
-#endif
-    request.set_trainer_id(trainerId_);
-    request.set_update_mode(updateMode);
-    request.set_send_back_parameter(sendBackParameter);
-    request.set_send_back_parameter_type(sendBackParameterType);
-    request.set_num_samples(numSamples);
-    request.set_cost(cost);
-    request.set_batch_status(batchStatus);
-    CHECK_EQ(request.blocks_size(), 0);
-    VLOG(10) << "request: trainer_id: " << request.trainer_id()
-             << " update_mode: " << request.update_mode()
-             << " send_back_parameter: " << request.send_back_parameter()
-             << " send_back_parameter_type: "
-             << request.send_back_parameter_type()
-             << " num_samples: " << request.num_samples()
-             << " cost: " << request.cost()
-             << " batch_status: " << request.batch_status();
-  }
-  for (const auto& segments : parameterSegments) {
-    const auto it = parameterMap_.find(segments.id);
-    CHECK(it != parameterMap_.end());
-    Parameter* parameter = it->second.get();
-    CHECK(parameter != nullptr) << "parameter is nullptr";
-    int64_t nameHash = std::hash<std::string>()(segments.name);
-    bool sendingPara = !(updateMode == PSERVER_UPDATE_MODE_GET_PARAM ||
-                         updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE ||
-                         updateMode == PSERVER_UPDATE_MODE_SET_PARAM_ZERO);
-    bool sparseUpdate = parameter->getConfig().sparse_remote_update() &&
-                        (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT ||
-                         updateMode == PSERVER_UPDATE_MODE_ASYNC_SGD ||
-                         updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE);
-
-    const auto blockSize = parameter->getConfig().parameter_block_size();
-    CHECK_GE(blockSize, 1LU) << "blockSize should be > 0, got " << blockSize;
-    const auto paraSize = parameter->getSize();
-    if (sparseUpdate) {
-      auto prefetchMat = std::dynamic_pointer_cast<SparsePrefetchRowCpuMatrix>(
-          parameter->getMat(PARAMETER_VALUE));
-      CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr";
-      auto sendMat = dynamic_cast<SparseRowCpuMatrix*>(
-          parameter->getMat(parameterType).get());
-      CHECK(sendMat != nullptr) << "sendMat is nullptr";
-
-      syncThreadPool_->exec([&](int tid, size_t numThreads) {
-        std::lock_guard<std::mutex> guard(sparseAutoGrowthMutex_);
-        const auto& localIndices = prefetchMat->getLocalIndices();
-        /// num of sparse rows
-        size_t nLocalBlocks = localIndices.size();
-        uint64_t beginDim = 0;
-        uint64_t endDim = 0;
-
-        // HACK(typhoonzero): let it resize first
-        prefetchMat->getLocalRow(nLocalBlocks);
-        sendMat->getLocalRow(nLocalBlocks);
-
-        for (size_t row = 0; row < nLocalBlocks; ++row) {
-          int64_t blockId = localIndices[row];  // local row -> sparse row
-          int serverId = std::abs((blockId + nameHash) % serviceNum_);
-          if (serverId % numThreads != (size_t)tid) {
-            continue;
-          }
-
-          beginDim = blockId * blockSize;
-          endDim = std::min(beginDim + blockSize, paraSize);
-
-          auto& request = sendJob->parallelRequests[serverId];
-          ParameterBlock* block = request.add_blocks();
-          block->set_para_id(segments.id);
-          /// global sparse row id
-          block->set_block_id(blockId);
-          /// local row offset
-          block->set_begin_pos(row * blockSize);
-          /// block len
-          block->set_block_size(endDim - beginDim);
-          if (sendingPara) {
-            sendJob->parallelInputIovs[serverId].push_back(
-                {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize});
-            /// detect the sparse parameter distribution
-            sparseDistribution_->probeDistribution(serverId,
-                                                   sizeof(real) * blockSize);
-          }
-        }
-      });
-
-    } else {  /// parameter set for dense and sparse
-      real* buf =
-          sendingPara ? parameter->getBuf(parameterType)->getPoint(0) : nullptr;
-      uint64_t endDim = 0;
-      for (uint64_t beginDim = 0; beginDim < paraSize; beginDim = endDim) {
-        endDim = std::min(beginDim + blockSize, paraSize);
-        int64_t blockId = beginDim / blockSize;
-        int serverId = std::abs((blockId + nameHash) % serviceNum_);
-
-        auto& request = sendJob->parallelRequests[serverId];
-        ParameterBlock* block = request.add_blocks();
-        block->set_para_id(segments.id);
-        block->set_block_id(blockId);
-        block->set_begin_pos(beginDim);
-        block->set_block_size(endDim - beginDim);
-        if (buf) {
-          sendJob->parallelInputIovs[serverId].push_back(
-              {buf + beginDim, sizeof(real) * ((size_t)(endDim - beginDim))});
-        }
-      }
-    }
-  }  // parameterSegments
-
-  sparseDistribution_->checkAndResetDistribution();
-}
-
-void ParameterClient2::sendAndReceiveParameter(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    ParameterType sendBackParameterType,
-    ParameterType recvParameterType) {
-  prepareSendData(updateMode,
-                  parameterType,
-                  parameterSegments,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  sendBackParameterType,
-                  /*batchStatus = */ BATCH_START_AND_FINISH,
-                  &sendJob_);
-
-  syncThreadPool_->exec([&](int tid, size_t numThreads) {
-    this->sendParallel(tid, numThreads, recvParameterType);
-  });
-}
-
-void ParameterClient2::sendParameter(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    BatchStatus batchStatus) {
-  SendJobPtr sendJob = std::make_shared<SendJob>();
-  prepareSendData(updateMode,
-                  parameterType,
-                  parameterSegments,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  PARAMETER_VALUE,
-                  batchStatus,
-                  sendJob.get());
-
-  for (int i = 0; i < threadNum_; i++) {
-    sendJobQueue_[i]->enqueue(sendJob);
-  }
-}
-
-void ParameterClient2::recvParameter() { recvSyncBarrier_->wait(); }
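The placement rule in prepareSendData() above spreads a parameter's blocks across pservers by block id, offset by a hash of the parameter name so that different parameters start on different servers. A standalone sketch of just that mapping (the parameter name is a made-up example):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <functional>
    #include <string>

    int main() {
      const int serviceNum = 4;
      int64_t nameHash = std::hash<std::string>()("fc_layer.w0");
      for (int64_t blockId = 0; blockId < 6; ++blockId) {
        // same expression as prepareSendData(): consecutive blocks of one
        // parameter land on consecutive pservers
        int serverId = std::abs((blockId + nameHash) % serviceNum);
        std::printf("block %lld -> pserver %d\n", (long long)blockId, serverId);
      }
      return 0;
    }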
-
-void ParameterClient2::send(int threadId) {
-  int index = threadId;
-  LOG(INFO) << "send thread " << threadId << " started";
-  int numMyClients = divup(serviceNum_ - index, threadNum_);
-  while (true) {
-    SendJobPtr recvJob = sendJobQueue_[index]->dequeue();
-    if (stopping_) {
-      recvJobQueue_[index]->enqueue(recvJob);
-      break;
-    }
-    for (int j = 0; j < numMyClients; ++j) {
-      REGISTER_TIMER("client_send");
-      int i = threadNum_ * j + index;
-      /// Try to make different clients send data to different pservers
-      /// at the same time so that they will not flood data to the same
-      /// pserver.
-      i = calcClientId(i, serviceNum_);
-      if (recvJob->parallelRequests.size()) {
-        clients_[i].send("sendParameter",
-                         recvJob->parallelRequests[i],
-                         recvJob->parallelInputIovs[i]);
-      } else {
-        clients_[i].send("sendData",
-                         recvJob->parallelDataRequests[i],
-                         recvJob->parallelInputIovs[i]);
-      }
-    }
-    recvJobQueue_[index]->enqueue(recvJob);
-  }
-}
-
-void ParameterClient2::recv(int threadId) {
-  LOG(INFO) << "recv thread " << threadId << " started";
-  int index = threadId;
-  int numMyClients = divup(serviceNum_ - index, threadNum_);
-  while (true) {
-    std::vector<void*> bufs;
-    SendParameterResponse response;
-    SendDataResponse dataResponse;
-    SendJobPtr recvJob = recvJobQueue_[index]->dequeue();
-    if (stopping_) break;
-    for (int j = 0; j < numMyClients; ++j) {
-      REGISTER_TIMER("client_recv");
-      int i = threadNum_ * j + index;
-      i = calcClientId(i, serviceNum_);
-      if (recvJob->parallelRequests.size()) {
-        auto msgReader = clients_[i].recv(&response);
-        CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size());
-        bufs.clear();
-        bufs.reserve(response.blocks_size());
-        for (auto& block : response.blocks()) {
-          auto it = parameterMap_.find(block.para_id());
-          CHECK(it != parameterMap_.end());
-          Parameter* parameter = it->second.get();
-          real* buf =
-              parameter->getBuf(PARAMETER_VALUE)->getPoint(block.begin_pos());
-          CHECK_EQ(msgReader->getBlockLength(bufs.size()),
-                   sizeof(real) * (block.block_size()));
-          bufs.push_back(buf);
-        }
-        msgReader->readBlocks(bufs);
-      } else {
-        auto msgReader = clients_[i].recv(&dataResponse);
-        CHECK_EQ(msgReader->getNumBlocks(), (size_t)dataResponse.blocks_size());
-        size_t totalLen = msgReader->getTotalLength();
-        if (0 == totalLen) {
-          continue;
-        }
-        auto& recvMem = recvDataMems_[dataResponse.server_id()];
-        CHECK_EQ(dataResponse.blocks_size(), 1)
-            << "Only one block is currently supported!";
-        auto& block = dataResponse.blocks(0);
-        CHECK_EQ(totalLen % sizeof(block.data_size()), 0U);
-        recvMem = std::make_shared<CpuMemoryHandle>(totalLen);
-        msgReader->readNextBlock(recvMem.get()->getBuf());
-      }
-    }
-    recvSyncBarrier_->wait();
-  }
-}
-
-void ParameterClient2::waitPassStart() {
-  WaitPassStartRequest request;
-  std::vector<WaitPassStartResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::waitPassFinish() {
-  WaitPassFinishRequest request;
-  std::vector<WaitPassFinishResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::synchronize(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::asyncFinishPass(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  request.set_trainer_id(trainerId_);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::setConfig(const OptimizationConfig& optConfig,
-                                 const std::string& saveDir,
-                                 bool isSparseServer) {
-  SetConfigRequest request;
-  std::vector<SetConfigResponse> responses;
-
-  for (auto& nameAndPara : parameterMap_) {
-    *request.add_param_configs() = nameAndPara.second->getConfig();
-  }
-
-  *request.mutable_opt_config() = optConfig;
-  request.set_save_dir(saveDir);
-  request.set_is_sparse_server(isSparseServer);
-
-  std::vector<SetConfigRequest> requests;
-  requests.resize(clients_.size());
-  for (size_t i = 0; i < requests.size(); ++i) {
-    requests[i].CopyFrom(request);
-    requests[i].set_server_id(i);
-  }
-
-  responses.resize(clients_.size());
-  size_t numClients = clients_.size();
-  for (size_t i = 0; i < numClients; ++i) {
-    clients_[i].send(__func__, requests[i]);
-  }
-  for (size_t i = 0; i < numClients; ++i) {
-    clients_[i].recv(&responses[i]);
-  }
-}
-
-bool ParameterClient2::inStatus(PServerStatus status) {
-  GetStatusRequest request;
-  std::vector<GetStatusResponse> responses;
-
-  bool ok = true;
-  multiCall("getStatus", request, &responses);
-  for (auto& response : responses) {
-    if (response.status() != status) {
-      ok = false;
-    }
-  }
-
-  return ok;
-}
-
-void ParameterClient2::setStatus(PServerStatus status) {
-  SetStatusRequest request;
-  request.set_status(status);
-  std::vector<SetStatusResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::waitForStatus(PServerStatus status) {
-  while (!inStatus(status)) {
-    sleep(1);
-  }
-}
-
-template <typename ProtoResponse>
-static void validateResponses(const std::vector<ProtoResponse>& responses) {
-  for (auto& response : responses) {
-    CHECK(response.return_message().empty())
-        << "client " << &response - &responses[0]
-        << " error: " << response.return_message();
-  }
-}
-
-PServerVector ParameterClient2::createVector() {
-  CreateVectorRequest request;
-  std::vector<CreateVectorResponse> responses;
-  int64_t handle = -1;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-
-  for (auto& response : responses) {
-    if (handle == -1) {
-      handle = response.handle();
-    } else {
-      CHECK_EQ(handle, response.handle())
-          << "Inconsistent handle from client " << &response - &responses[0]
-          << " " << handle << " " << response.handle();
-    }
-  }
-  return PServerVector{handle};
-}
-
-void ParameterClient2::releaseVector(PServerVector handle) {
-  ReleaseVectorRequest request;
-  std::vector<ReleaseVectorResponse> responses;
-
-  request.set_handle(handle.handle);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-PServerMatrix ParameterClient2::createMatrix(int32_t numCols) {
-  CreateMatrixRequest request;
-  std::vector<CreateMatrixResponse> responses;
-  int64_t handle = -1;
-
-  request.set_num_cols(numCols);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-
-  for (auto& response : responses) {
-    if (handle == -1) {
-      handle = response.handle();
-    } else {
-      CHECK_EQ(handle, response.handle())
-          << "Inconsistent handle from client " << &response - &responses[0]
-          << " " << handle << " " << response.handle();
-    }
-  }
-  return PServerMatrix{handle};
-}
-
-void ParameterClient2::releaseMatrix(PServerMatrix handle) {
-  ReleaseMatrixRequest request;
-  std::vector<ReleaseMatrixResponse> responses;
-
-  request.set_handle(handle.handle);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-void PreparedOperations::addOperationHelper(Operation* op, CpuVectorPtr vec) {
-  ProtoVector& pvec = *op->add_vectors();
-  size_t dim = vec->getSize();
-  pvec.set_dim(dim);
-  copyToRepeatedField(pvec.mutable_values(), vec->getData(), vec->getSize());
-}
-
-void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) {
-  ProtoMatrix& pmat = *op->add_matrices();
-  pmat.set_num_cols(mat->getWidth());
-  pmat.set_num_rows(mat->getHeight());
-  copyToRepeatedField(
-      pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows());
-}
-
-static inline real addTwo(real a, double b) { return a + b; }
-
-void ParameterClient2::doOperation(PreparedOperations& ops,
-                                   bool waitForGradient,
-                                   bool sendBackGradient,
-                                   bool releasePass) {
-  std::vector<DoOperationResponse> responses;
-  ops.request_.set_wait_for_gradient(waitForGradient);
-  ops.request_.set_send_back_parameter(sendBackGradient);
-  ops.request_.set_release_pass(releasePass);
-  multiCall(__func__, ops.request_, &responses);
-  validateResponses(responses);
-  size_t numPassFinishServers = 0;
-
-  size_t numOps = ops.request_.operations_size();
-  for (auto& response : responses) {
-    numPassFinishServers += response.pass_finish();
-    CHECK_EQ(numOps, (size_t)response.results_size());
-    for (size_t opId = 0; opId < numOps; ++opId) {
-      const OperationResult& result = response.results(opId);
-      std::vector<real*>& resultScalars = ops.localResults_[opId].resultScalars;
-      std::vector<CpuVectorPtr>& resultVectors =
-          ops.localResults_[opId].resultVectors;
-      std::vector<CpuMatrixPtr>& resultMatrices =
-          ops.localResults_[opId].resultMatrices;
-
-      if (&response == &responses[0]) {
-        /// Initialize the results to zero
-
-        resultScalars.resize(result.scalars_size());
-        for (auto p : resultScalars) {
-          if (!p) continue;
-          *p = 0;
-        }
-        size_t numVectors = result.vectors_size();
-        resultVectors.resize(numVectors);
-        for (size_t i = 0; i < numVectors; ++i) {
-          if (!resultVectors[i]) continue;
-          resultVectors[i]->resize(result.vectors(i).dim());
-          resultVectors[i]->zeroMem();
-        }
-        size_t numMatrices = result.matrices_size();
-        resultMatrices.resize(numMatrices);
-        for (size_t i = 0; i < numMatrices; ++i) {
-          if (!resultMatrices[i]) continue;
-          resultMatrices[i]->resize(result.matrices(i).num_rows(),
-                                    result.matrices(i).num_cols());
-          resultMatrices[i]->zeroMem();
-        }
-      }
-
-      // aggregate the results from each pserver into the local results
-
-      CHECK_EQ(resultScalars.size(), (size_t)result.scalars_size());
-      for (ssize_t i = 0; i < result.scalars_size(); ++i) {
-        real* rscalar = resultScalars[i];
-        if (!rscalar) continue;
-        *rscalar += result.scalars(i);
-      }
-
-      CHECK_EQ(resultVectors.size(), (size_t)result.vectors_size());
-      for (auto& vec : result.vectors()) {
-        int i = &vec - &result.vectors(0);
-        CpuVectorPtr rvec = resultVectors[i];
-        if (!rvec) continue;
-        CHECK_EQ(rvec->getSize(), (size_t)vec.dim());
-        std::transform(rvec->getData(),
-                       rvec->getData() + rvec->getSize(),
-                       vec.values().data(),
-                       rvec->getData(),
-                       addTwo);
-      }
-
-      CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size());
-      for (auto& mat : result.matrices()) {
-        int i = &mat - &result.matrices(0);
-        CpuMatrixPtr rmat = resultMatrices[i];
-        if (!rmat) continue;
-        CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows());
-        CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols());
-
-        std::transform(rmat->getData(),
-                       rmat->getData() + rmat->getElementCnt(),
-                       mat.values().data(),
-                       rmat->getData(),
-                       addTwo);
-      }
-    }
-  }
-  passFinish_ = numPassFinishServers == clients_.size();
-}
-
-real ParameterClient2::vectorDotProduct(PServerVector u, PServerVector v) {
-  real result = 0.0;
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_utv, u, v)(&result);
-  doOperation(ops, false, false);
-  return result;
-}
-
-void ParameterClient2::vectorScale(PServerVector u, real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au, u, a);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorCopy(PServerVector src, PServerVector dst) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_COPY, src, dst);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorAddMult(PServerVector u, PServerVector v, real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)1);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorAddMultInto(PServerVector u,
-                                         PServerVector v,
-                                         PServerVector w,
-                                         real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv_cw, v, w, u, (real)1, a, (real)0);
-  doOperation(ops, false, false);
-}
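The coefficient encodings used by the algebra wrappers above and below are easy to misread, so here is a local illustration of the semantics implied by the call sites (assumed, since the operation itself runs server-side): PSERVER_OP_au_bv computes u = a*v + b*u, so vectorAddMult() passes b = 1 while vectorScaleInto() passes b = 0.

    #include <cstdio>
    #include <vector>

    // local stand-in for the server-side au_bv operation: u = a*v + b*u
    void au_bv(std::vector<double>& u, const std::vector<double>& v,
               double a, double b) {
      for (size_t i = 0; i < u.size(); ++i) u[i] = a * v[i] + b * u[i];
    }

    int main() {
      std::vector<double> u = {1, 1}, v = {2, 3};
      au_bv(u, v, /*a=*/0.5, /*b=*/1);      // vectorAddMult: u += 0.5 * v
      std::printf("%g %g\n", u[0], u[1]);   // prints 2 2.5
      return 0;
    }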
-
-void ParameterClient2::vectorScaleInto(PServerVector u,
-                                       PServerVector v,
-                                       real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)0);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::loadValueVector(const std::string& dirName) {
-  LoadValueRequest request;
-  request.set_dir_name(dirName);
-  std::vector<LoadValueResponse> responses;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-void ParameterClient2::saveValueVector(const std::string& dirName) {
-  SaveValueRequest request;
-  request.set_dir_name(dirName);
-  std::vector<SaveValueResponse> responses;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-}  // namespace paddle
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
deleted file mode 100644
index c96bb787151a525556c8217629109de201762cff..0000000000000000000000000000000000000000
--- a/paddle/pserver/ParameterClient2.h
+++ /dev/null
@@ -1,602 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include
-#include
-#include
-
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/pserver/BaseClient.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/Queue.h"
-#include "paddle/utils/Util.h"
-
-#include "ParameterService.pb.h"
-
-#include "ProtoServer.h"
-#include "SparseParameterDistribution.h"
-
-DECLARE_int32(parallel_thread_num);
-
-namespace paddle {
-
-struct PServerMatrix {
-  int64_t handle;
-};
-
-struct PServerVector {
-  int64_t handle;
-};
-
-/**
- * @brief A class to help to prepare server-side operations.
- */
-class PreparedOperations {
- protected:
-  class ResultsAdder;
-  struct LocalOperationResult;
-
- public:
-  /**
-   * Offers an easy way to prepare operations that will be performed on
-   * the server side.
-   *
-   * Usage:
-   * @code
-   * addOperation(optype, arguments...)(results...)
-   * @endcode
-   *
-   * Examples:
-   * 1. set a pserver vector to 1:
-   * @code
-   * PServerVector u = parameterClient.createVector();
-   * addOperation(PSERVER_OP_RESET, u, (real)1);
-   * @endcode
-   *
-   * 2. compute the inner product of two pserver vectors:
-   * @code
-   * PServerVector u = parameterClient.createVector();
-   * PServerVector v = parameterClient.createVector();
-   * real result;
-   * addOperation(PSERVER_OP_utv, u, v)(&result)
-   * @endcode
-   *
-   * @param[in] operation The operation that the pserver will perform.
-   * @param[in] args Argument list of the operation.
-   * @return A ResultsAdder object initialized with the last element of
-   *         localResults_.
-   */
-  template <typename... Args>
-  ResultsAdder addOperation(MatrixVectorOperation operation, Args... args) {
-    Operation* op = request_.add_operations();
-    op->set_operation(operation);
-    localResults_.emplace_back();
-    addOperationHelper(op, args...);
-    return ResultsAdder(&localResults_.back());
-  }
-
- protected:
-  void addOperationHelper(Operation* op) {}
-
-  /**
-   * @brief Helper function to add a new operation that takes a PServerVector
-   *        as an operand.
-   */
-  void addOperationHelper(Operation* op, PServerVector arg) {
-    op->add_pvectors(arg.handle);
-  }
-
-  /**
-   * @brief Helper function to add a new operation that takes a PServerMatrix
-   *        as an operand.
-   */
-  void addOperationHelper(Operation* op, PServerMatrix arg) {
-    op->add_pmatrices(arg.handle);
-  }
-
-  /**
-   * @brief Helper function to add a new operation that takes a real-valued
-   *        scalar as an operand.
-   */
-  void addOperationHelper(Operation* op, real arg) { op->add_scalars(arg); }
-
-  /**
-   * @brief Helper function to add a new operation that takes a CpuVectorPtr
-   *        as an operand.
-   * @note The array of CpuVectors that arg points to will be copied to
-   *       op's vectors field.
-   */
-  void addOperationHelper(Operation* op, CpuVectorPtr arg);
-
-  /**
-   * @brief Helper function to add a new operation that takes a CpuMatrixPtr
-   *        as an operand.
-   * @note The array of CpuMatrixs that arg points to will be copied to
-   *       op's matrices field.
-   */
-  void addOperationHelper(Operation* op, CpuMatrixPtr arg);
-
-  /**
-   * @brief Helper function to add a new operation and prepare the operands.
-   *
-   * @tparam Arg An operand of the operation.
-   * @tparam Args A list of the rest of the operands of the operation.
-   * @param op Pointer to an Operation object.
-   */
-  template <typename Arg, typename... Args>
-  void addOperationHelper(Operation* op, Arg arg, Args... args) {
-    addOperationHelper(op, arg);
-    addOperationHelper(op, args...);
-  }
-
-  /**
-   * @brief ResultsAdder offers easy ways to quickly store operation results.
-   */
-  class ResultsAdder {
-   public:
-    explicit ResultsAdder(LocalOperationResult* localResult)
-        : localResult_(localResult) {}
-    template <typename... Args>
-    void operator()(Args... args) {
-      addResult(args...);
-    }
-    void addResult() {}
-    void addResult(real* arg) { localResult_->resultScalars.push_back(arg); }
-    void AddResult(CpuVectorPtr arg) {
-      localResult_->resultVectors.push_back(arg);
-    }
-    void AddResult(CpuMatrixPtr arg) {
-      localResult_->resultMatrices.push_back(arg);
-    }
-    template <typename Arg, typename... Args>
-    void addResult(Arg arg, Args... args) {
-      addResult(arg);
-      addResult(args...);
-    }
-
-   protected:
-    LocalOperationResult* localResult_;
-  };
-
- protected:
-  DoOperationRequest request_;
-  std::vector<iovec> inputIovs_;
-  struct LocalOperationResult {
-    std::vector<real*> resultScalars;
-    std::vector<CpuVectorPtr> resultVectors;
-    std::vector<CpuMatrixPtr> resultMatrices;
-  };
-  std::vector<LocalOperationResult> localResults_;
-  friend class ParameterClient2;
-};
-
-struct ParameterSegments {
-  std::string name;  // name of the parameter
-  size_t id;         // id of the parameter
-};
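The addOperationHelper()/addResult() pairs above use the classic variadic-template peeling pattern: one overload per operand type, plus a recursive template that strips one argument at a time until the empty base case. A minimal standalone sketch of the same pattern:

    #include <cstdio>

    void addArg() {}  // base case: no operands left

    void addArg(double scalar) { std::printf("scalar %g\n", scalar); }
    void addArg(const char* name) { std::printf("vector %s\n", name); }

    template <typename Arg, typename... Args>
    void addArg(Arg arg, Args... rest) {
      addArg(arg);      // dispatch on the first operand's type
      addArg(rest...);  // recurse on the remainder
    }

    int main() {
      addArg("u", "v", 1.5);  // mirrors addOperation(op, v, u, a) style calls
      return 0;
    }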
-
-/**
- * The client interface for the parameter server. ParameterClient2 supports
- * two modes for managing connections to parameter servers: in the first
- * mode, one connection is shared by two threads that are separately
- * responsible for the sending and receiving activities; in the second mode,
- * one connection is owned by only one thread, and all of the sending and
- * receiving activities run in that single thread.
- * TODO(yanfei):
- * An additional core idea for further optimizing pserver performance is
- * to do sync-sgd at the parameter level instead of the pserver level.
- * Full parallelization at the parameter level for sync-sgd can also track
- * the layer-by-layer forward/backward computation of deeper models.
- * Firstly, the pserver could fully parallelize all computation at the
- * parameter level and start sending back a parameter's value immediately
- * once that parameter is ready, instead of waiting until all parameter
- * values are ready.
- * Secondly, the parameter client could write parameters back to the GPU
- * instead of waiting until all parameters have been received at the CPU
- * host end.
- */
-class ParameterClient2 : public BaseClient {
- public:
-  /** Constructor.
-   * @param separate True if sending and receiving activities are separated
-   *                 into 2 threads, otherwise false.
-   * @param port Port number that the parameter client runs on.
-   * @param numPorts Number of ports the parameter client occupies;
-   *                 numPorts * pserver number is the total number of
-   *                 connections the parameter client maintains.
-   */
-  ParameterClient2(bool separate = false,
-                   int port = FLAGS_port,
-                   int numPorts = FLAGS_ports_num);
-
-  ~ParameterClient2();
-
-  static int calcParameterBlockSize(const std::vector<ParameterPtr>& parameters,
-                                    size_t serviceNum);
-
- public:
-  bool init(const std::vector<ParameterPtr>& parameters);
-
-  /// service functions
-
-  /**
-   * @brief Sends the segments in the parameter to the parameter servers,
-   *        then receives the response from the servers.
-   * @param[in] updateMode Indicates how the parameters should be updated on
-   *            the server side.
-   * @param[in] parameterType Type of parameter that will be sent.
-   * @param[in] segments Segments in the parameter that will be sent.
-   * @param[in] numSamples Number of samples this update is based on.
-   * @param[in] cost Cost of the batch, used to calculate the global
-   *            objective value.
-   * @param[in] sendBackParameter True if the updated parameters should be
-   *            sent back, otherwise false.
-   * @param[in] sendBackParameterType Send back parameter type on the pserver,
-   *            PARAMETER_VALUE by default.
-   * @param[in] recvParameterType pserver[sendBackParameterType] will be
-   *            copied to client[recvParameterType].
-   * @note Only parameterType will be sent.
-   */
-  void sendAndReceiveParameter(ParameterUpdateMode updateMode,
-                               ParameterType parameterType,
-                               const std::vector<ParameterSegments>& segments,
-                               int64_t numSamples,
-                               real cost,
-                               bool sendBackParameter,
-                               ParameterType sendBackParameterType,
-                               ParameterType recvParameterType);
-
-  /**
-   * @brief Sends all parameters to the parameter servers, and receives the
-   *        response from the servers.
-   */
-  void sendAndReceiveParameter(
-      ParameterUpdateMode updateMode,
-      ParameterType parameterType,
-      int64_t numSamples,
-      real cost,
-      bool sendBackParameter,
-      ParameterType sendBackParameterType = PARAMETER_VALUE,
-      ParameterType recvParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(updateMode,
-                            parameterType,
-                            allSegments_,
-                            numSamples,
-                            cost,
-                            sendBackParameter,
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /**
-   * @brief Sends the segments in the parameter to the parameter servers.
-   *        Each sendParameter() must be paired with a recvParameter() in
-   *        the future. Only parameterType will be sent.
-   *
-   * @param[in] updateMode Indicates how the parameters should be updated on
-   *            the server side.
-   * @param[in] parameterType Type of parameter that will be sent.
-   * @param[in] segments Segments in the parameter that will be sent.
-   * @param[in] numSamples Number of samples this update is based on.
-   * @param[in] cost Cost of the batch, used to calculate the global
-   *            objective value.
-   * @param[in] sendBackParameter True if the updated parameters should be
-   *            sent back, otherwise false.
-   * @param[in] batchStatus Status of the batch.
-   * @note This function is non-blocking. This means that the parameter
-   *       should not change between this call and recvParameter().
-   */
-  void sendParameter(ParameterUpdateMode updateMode,
-                     ParameterType parameterType,
-                     const std::vector<ParameterSegments>& segments,
-                     int64_t numSamples,
-                     real cost,
-                     bool sendBackParameter,
-                     BatchStatus batchStatus);
-
-  void recvParameter();
-
-  /**
-   * @brief Sends all parameters to the parameter servers; recvParameter()
-   *        has to be invoked afterwards.
-   *
-   * @note This function is non-blocking. This means that the parameter
-   *       should not change between this call and recvParameter().
-   */
-  void sendParameter(ParameterUpdateMode updateMode,
-                     ParameterType parameterType,
-                     int64_t numSamples,
-                     real cost,
-                     bool sendBackParameter,
-                     BatchStatus batchStatus) {
-    sendParameter(updateMode,
-                  parameterType,
-                  allSegments_,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  batchStatus);
-  }
-
-  /// Get all parameters from the parameter servers
-  void getParameter(ParameterType recvParameterType = PARAMETER_VALUE,
-                    ParameterType sendBackParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM,
-                            PARAMETER_VALUE,
-                            0,     // numSamples = 0
-                            0,     // cost = 0
-                            true,  // sendBackParameter = true
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /// Get parameters by sparse row ids from the parameter servers
-  void getParameterSparse(
-      ParameterType recvParameterType = PARAMETER_VALUE,
-      ParameterType sendBackParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM_SPARSE,
-                            PARAMETER_VALUE,
-                            0,     // numSamples = 0
-                            0,     // cost = 0
-                            true,  // sendBackParameter = true
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /// Set all parameters on the parameter servers using the local parameters
-  void setParameter() {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM,
-                            PARAMETER_VALUE,
-                            0,       // numSamples = 0
-                            0,       // cost = 0
-                            false);  // sendBackParameter = false
-  }
-
-  /**
-   * Set all parameters on the parameter servers to zero values;
-   * no local parameters are sent.
-   */
-  void setParameterZero() {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO,
-                            PARAMETER_VALUE,
-                            0,       // numSamples = 0
-                            0,       // cost = 0
-                            false);  // sendBackParameter = false
-  }
-
-  /**
-   * @brief Wait until all gradient servers start one pass.
-   *
-   * @note This is now only used by the gradient servers for the "sgd"
-   *       algorithm. Calling this function means that the calling gradient
-   *       server is ready to start a new pass.
-   */
-  void waitPassStart();
-
-  /**
-   * @brief Wait until all gradient servers finish one pass.
-   *
-   * @note This is now only used by the gradient servers for the "sgd"
-   *       algorithm. Calling this function means that the calling gradient
-   *       server finishes one pass.
-   */
-  void waitPassFinish();
-
-  /// Wait until all gradient servers call this function.
-  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  /// Called when async-sgd finishes a pass.
-  void asyncFinishPass(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  void asyncStartPass(SyncObject syncObjectId = SYNC_DEFAULT) {
-    return synchronize(syncObjectId);
-  }
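The sendParameter()/recvParameter() pairing above is a split-phase protocol: the send is non-blocking, and the buffer must stay untouched until the matching receive. A standalone sketch of that discipline, with std::async standing in for the background communication threads:

    #include <cassert>
    #include <future>
    #include <vector>

    int main() {
      std::vector<float> param = {1.f, 2.f, 3.f};
      // "sendParameter": hand the buffer over to the communication layer.
      auto pending = std::async(std::launch::async, [&param] {
        return param.size();  // pretend the pservers consumed the buffer
      });
      // param must stay unchanged here, exactly as the @note above warns.
      auto acked = pending.get();  // "recvParameter": block until done
      assert(acked == param.size());
      return 0;
    }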
-
-  /**
-   * @brief Execute the prepared operations on the pservers, fetch the
-   *        results, and aggregate the results from the different pservers.
-   * @param[in] ops Prepared operations that will be executed on the pservers.
-   * @param[in] waitForGradient If true, wait for the gradient to be ready
-   *            before starting the operations.
-   * @param[in] sendBackParameter If true, send back the parameter to the
-   *            clients after the operations are finished.
-   * @param[in] releasePass If true, and if all clients call waitPassFinish,
-   *            signal all clients to finish the pass.
-   */
-  void doOperation(PreparedOperations& ops,
-                   bool waitForGradient,
-                   bool sendBackParameter,
-                   bool releasePass = true);
-
-  /**
-   * Set the configuration of the pserver, including the parameter config
-   * and the optimization config.
-   */
-  void setConfig(const OptimizationConfig& optConfig,
-                 const std::string& saveDir = "",
-                 bool isSparseServer = false);
-
-  /// Return true if all pservers are in the given status
-  bool inStatus(PServerStatus status);
-  bool isPassFinish() { return passFinish_; }
-
-  /// Set the pserver status
-  void setStatus(PServerStatus status);
-
-  /**
-   * @brief Wait until all pservers are at the given status.
-   * @note This function is not suitable for frequent use,
-   *       because it sleeps 1 second each time the condition is not yet
-   *       satisfied.
-   */
-  void waitForStatus(PServerStatus status);
-
-  /// Create a column vector. The size is the dimension of the parameter.
-  PServerVector createVector();
-
-  /// Release the PServerVector with the given handle.
-  void releaseVector(PServerVector handle);
-
-  /**
-   * Create a column-major matrix. The number of rows is the dimension of
-   * the parameter. The number of columns is specified by numCols.
-   */
-  PServerMatrix createMatrix(int32_t numCols);
-
-  /// Release the PServerMatrix with the given handle.
-  void releaseMatrix(PServerMatrix handle);
-
-  // Some basic algebra functions
-  /// Calculate the dot product of u and v
-  real vectorDotProduct(PServerVector u, PServerVector v);
-
-  /// Scale u by a
-  void vectorScale(PServerVector u, real a);
-
-  /// Copy from src to dst
-  void vectorCopy(PServerVector src, PServerVector dst);
-
-  /// u += v * a
-  void vectorAddMult(PServerVector u, PServerVector v, real a);
-
-  /// u = v + w * a
-  void vectorAddMultInto(PServerVector u,
-                         PServerVector v,
-                         PServerVector w,
-                         real a);
-
-  /// u = v * a
-  void vectorScaleInto(PServerVector u, PServerVector v, real a);
-
-  /// Return the pserver parameter value.
-  PServerVector getPServerParameterValue() {
-    PServerVector vec;
-    vec.handle = PARAMETER_VALUE;
-    return vec;
-  }
-
-  /// Return the pserver parameter gradient.
-  PServerVector getPServerParameterGradient() {
-    PServerVector vec;
-    vec.handle = PARAMETER_GRADIENT;
-    return vec;
-  }
-
-  /**
-   * Tell the pservers to load the value vector from a file.
-   *
-   * @param[in] dirName The directory that contains the value vector file.
-   */
-  void loadValueVector(const std::string& dirName);
-
-  /// Tell the pservers to save the value vector to a file.
-  void saveValueVector(const std::string& dirName);
-
-  void setTrainerId(int trainerId) { trainerId_ = trainerId; }
-
-#ifndef PADDLE_DISABLE_TIMER
-  void setForwardbackwardTime(uint64_t delta) { forwardbackwordTime_ = delta; }
-#endif
-
- protected:
-  template <typename ProtoIn, typename ProtoOut>
-  void multiCall(const char* funcName,
-                 const ProtoIn& request,
-                 std::vector<ProtoOut>* responses) {
-    responses->resize(clients_.size());
-    size_t numClients = clients_.size();
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].send(funcName, request);
-    }
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].recv(&(*responses)[i]);
-    }
-  }
-
- private:
-  void destroy();
-
-  /**
-   * @brief management function for parallelizing send/recv on all
-   *        connections to all pservers. It is called under one
-   *        SyncThreadPool and supports using N threads to control M
-   *        connections. The receiving actions can only be started after
-   *        all sending actions on all connections owned by the current
-   *        thread have finished. Different connections controlled by
-   *        different threads can transfer data asynchronously.
-   */
-  void sendParallel(int tid,
-                    size_t numThreads,
-                    ParameterType recvParameterType);
-  /// sending thread routine for asynchronously sending data
-  void send(int threadId);
-  /// receiving thread routine for asynchronously receiving data
-  void recv(int threadId);
-
-  /**
-   * @brief main routine to build the data for the pservers
-   *
-   * @note it can prepare different kinds of parameter type data. It can
-   *       be regarded as a layer that bridges real parameter data and
-   *       protobuf data for communication.
-   *       TODO(yanfei):
-   *       an additional layer could be abstracted to encode and decode
-   *       data to/from protobuf data.
-   */
-  void prepareSendData(
-      ParameterUpdateMode updateMode,
-      ParameterType parameterType,  // client send type
-      const std::vector<ParameterSegments>& parameterSegments,
-      int64_t numSamples,
-      real cost,
-      bool sendBackParameter,
-      ParameterType sendBackParameterType,  // send back type in pserver
-      BatchStatus batchStatus,
-      SendJob* sendJob);
-
-  /// start the necessary threads for the thread pool
-  void initThreads();
-
- protected:
-  /// the start port number of the pserver;
-  /// all ports for dense and sparse are deduced from it by some rules
-  int port_;
-  /// identifies the trainer id using this client
-  int trainerId_;
-
-#ifndef PADDLE_DISABLE_TIMER
-  uint64_t forwardbackwordTime_;
-#endif
-  std::mutex sparseAutoGrowthMutex_;
-
-  /// map from id to parameter, used for decoding protobuf data
-  std::unordered_map<size_t, ParameterPtr> parameterMap_;
-  /// segments for all parameters that need to be synced
-  std::vector<ParameterSegments> allSegments_;
-
-  /// module for sensing the sparse parameter distribution on all pservers
-  std::unique_ptr<SparseParameterDistribution> sparseDistribution_;
-
-  /// thread pool for parallelizing all connections to the pservers
-  std::unique_ptr<SyncThreadPool> syncThreadPool_;
-
-  bool passFinish_;
-};
-
-}  // namespace paddle
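The sendParallel() comment above describes N threads sharing M connections: thread tid owns clients tid, tid + numThreads, and so on, and divup() counts how many that is. A standalone sketch with assumed sizes:

    #include <cstdio>

    static int divup(int a, int b) { return (a + b - 1) / b; }

    int main() {
      const int serviceNum = 5, numThreads = 2;  // assumed cluster shape
      for (int tid = 0; tid < numThreads; ++tid) {
        int numMyClients = divup(serviceNum - tid, numThreads);
        for (int j = 0; j < numMyClients; ++j) {
          std::printf("thread %d -> client %d\n", tid, numThreads * j + tid);
        }
      }
      return 0;  // thread 0 gets clients 0, 2, 4; thread 1 gets 1, 3
    }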
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
deleted file mode 100644
index f8814714c29a9776adde6a979a84241f733f65bd..0000000000000000000000000000000000000000
--- a/paddle/pserver/ParameterServer2.cpp
+++ /dev/null
@@ -1,1401 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterServer2.h"
-
-#include
-#include
-
-#include "paddle/math/SIMDFunctions.h"
-#include "paddle/parameter/AverageOptimizer.h"
-#include "paddle/parameter/FirstOrderOptimizer.h"
-#include "paddle/parameter/OptimizerFunctions.h"
-#include "paddle/parameter/OptimizerWithRegularizer.h"
-#include "paddle/parameter/ParameterOptimizer.h"
-#include "paddle/parameter/ParameterUpdateFunctions.h"
-#include "paddle/parameter/Regularizer.h"
-#include "paddle/parameter/ThreadLocalBuffer.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/StringUtil.h"
-
-DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
-DEFINE_double(async_lagged_ratio_min,
-              1.0,
-              "control config_.async_lagged_grad_discard_ratio() min value");
-DEFINE_double(
-    async_lagged_ratio_default,
-    1.5,
-    "if async_lagged_grad_discard_ratio is not set in trainer_config.conf, "
-    "use it as the default value");
-
-namespace paddle {
-
-const std::string ParameterServer2::kRetMsgInvalidMatrixHandle =
-    "Invalid matrix handle";
-const std::string ParameterServer2::kRetMsgInvalidVectorHandle =
-    "Invalid vector handle";
-const std::string ParameterServer2::kRetMsgUnknownOperation =
-    "Unknown operation";
-
-ParameterServer2::ParameterServer2(const std::string& addr,
-                                   int port,
-                                   int rdmaCpu)
-    : ProtoServer(addr, port, rdmaCpu),
-      dataSize_(0),
-      size_(0),
-      gradientReadyBarrier_(FLAGS_num_gradient_servers + 1),
-      parameterReadyBarrier_(FLAGS_num_gradient_servers + 1),
-      passBarrier_(FLAGS_num_gradient_servers + 1),
-      numPassFinishClients_(0),
-      allClientPassFinish_(false),
-      serverId_(-1),
-      batchId_(-1) {
-  /**
-   * register the functions for remote client calls. These functions are
-   * mapped to a data structure for quick lookup; each request from a
-   * trainer can contain one function name to indicate the remote action.
-   * This architecture looks like rpc style for the pserver.
-   */
-  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendParameter);
-  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendData);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, setConfig);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, setStatus);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, getStatus);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, doOperation);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, createVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, createMatrix);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseMatrix);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassStart);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassFinish);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, synchronize);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, asyncFinishPass);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, loadValueVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, saveValueVector);
-
-  /// thread pool for parallelizing some computations
-  if (FLAGS_pserver_num_threads > 1) {
-    syncThreadPool_.reset(new SyncThreadPool(FLAGS_pserver_num_threads, false));
-  }
-}
-
-bool ParameterServer2::init() {
-  vectors_.resize(NUM_PARAMETER_TYPES);
-  configMap_.clear();
-
-  numSamplesProcessed_ = 0;
-  cost_ = 0;
-  char* mpienv = getenv("OMPI_COMM_WORLD_SIZE");
-  if (mpienv != NULL) {
-    mpiSize_ = atoi(mpienv);
-  } else {
-    mpiSize_ = 1;
-  }
-  status_ = PSERVER_STATUS_NOT_SET;
-  dataMems_.resize(FLAGS_num_gradient_servers);
-  synchronizeBarriers_.resize(SyncObject_ARRAYSIZE);
-  for (auto& barrier : synchronizeBarriers_) {
-    barrier.reset(new ThreadBarrier(FLAGS_num_gradient_servers));
-  }
-
-  // initialization for discarding lagging gradients
-  asyncUpdateSteps_ = 0;
-  asyncTrainerSteps_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0);
-  asyncLaggedGradientsNum_ = 0;
-  asyncUpdateStat_.resize(static_cast<int64_t>(
-      FLAGS_num_gradient_servers * FLAGS_async_lagged_ratio_default));
-  asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0);
-  asyncTrainerDiscardStat_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0);
-  asyncTrainerCommitStat_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0);
-
-  return true;
-}
-
-void ParameterServer2::getStatus(const GetStatusRequest& request,
-                                 ProtoResponseCallback callback) {
-  (void)request;
-  GetStatusResponse response;
-  response.set_status(status_);
-  callback(response);
-}
-
-void ParameterServer2::setStatus(const SetStatusRequest& request,
-                                 ProtoResponseCallback callback) {
-  status_ = request.status();
-  SetStatusResponse response;
-  callback(response);
-}
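The REGISTER_SERVICE_FUNCTION calls above set up an rpc-style dispatch: each trainer request carries a function name that the server looks up in a handler table. A minimal sketch of the assumed shape (the real macros also bind protobuf request/response types):

    #include <cstdio>
    #include <functional>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, std::function<void()>> handlers;
      handlers["getStatus"] = [] { std::printf("handling getStatus\n"); };
      handlers["setConfig"] = [] { std::printf("handling setConfig\n"); };

      const std::string requested = "getStatus";  // function name from a trainer
      auto it = handlers.find(requested);
      if (it != handlers.end()) it->second();  // dispatch to the handler
      return 0;
    }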
-
-void ParameterServer2::setConfig(const SetConfigRequest& request,
-                                 ProtoResponseCallback callback) {
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-
-    serverId_ = request.server_id();
-    isSparseServer_ = request.is_sparse_server();
-
-    if (!request.save_dir().empty()) {
-      mkDir(request.save_dir().c_str());
-    }
-
-    for (const auto& config : request.param_configs()) {
-      CHECK(!configMap_.count(config.para_id()))
-          << "Duplicated parameter name: " << config.name();
-      configMap_[config.para_id()] = config;
-      CHECK_EQ(config.sparse_remote_update(), isSparseServer_);
-    }
-
-    config_ = request.opt_config();
-    if (config_.algorithm() == TrainAlgorithm::AsyncSGD) {
-      auto asyncLaggedRatio = config_.async_lagged_grad_discard_ratio();
-      if (asyncLaggedRatio <= FLAGS_async_lagged_ratio_min) {
-        LOG(INFO) << "WARNING: async_lagged_grad_discard_ratio is too small, "
-                  << "reset to default, async_lagged_grad_discard_ratio = "
-                  << FLAGS_async_lagged_ratio_default;
-        asyncLaggedRatio = FLAGS_async_lagged_ratio_default;
-      }
-      asyncLaggedThreshold_ =
-          static_cast<int64_t>(FLAGS_num_gradient_servers * asyncLaggedRatio);
-      LOG(INFO) << "discard lagged async gradient ratio: " << asyncLaggedRatio
-                << " asyncLaggedThreshold: " << asyncLaggedThreshold_;
-    }
-    if (isSparseServer_ && config_.num_batches_per_send_parameter() > 1) {
-      /// a sparse server must NOT use local update mode
-      config_.set_num_batches_per_send_parameter(1);
-    }
-
-    if (config_.num_batches_per_send_parameter() > 1 &&
-        config_.center_parameter_update_method() == "average") {
-      /// scale the L1/L2 decay rate as large as the L1/L2 decay applied in
-      /// the trainer when parameter regularization runs in the pserver
-      for (auto& pair : configMap_) {
-        ParameterConfig& config = pair.second;
-        if (config_.num_batches_per_send_parameter() ==
-            config.num_batches_regularization()) {
-          real scale =
-              config_.delta_add_rate() * config.num_batches_regularization();
-          if (config_.algorithm() == "sgd") {
-            scale *= FLAGS_num_gradient_servers;
-          }
-          config.set_decay_rate(config.decay_rate() * scale);
-          if (config.decay_rate() > 0.1f) {
-            LOG(FATAL) << "L2 decay=" << config.decay_rate()
-                       << " for parameter:" << config.name()
-                       << " is too large after scaling in the pserver!";
-          }
-          config.set_decay_rate_l1(config.decay_rate_l1() * scale);
-          if (config.decay_rate_l1() > 0.1f) {
-            LOG(FATAL) << "L1 decay=" << config.decay_rate_l1()
-                       << " for parameter:" << config.name()
-                       << " is too large after scaling in the pserver!";
-          }
-
-          LOG(INFO) << "parameter:" << config.name()
-                    << " decay applied in the pserver,"
-                    << " L1 decay=" << config.decay_rate_l1()
-                    << " L2 decay=" << config.decay_rate();
-        }
-      }
-    }
-  }
-
-  SetConfigResponse response;
-  callback(response);
-}
-
-real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
-  real sum = 0;
-  for (const auto buffer : buffers) {
-    for (size_t i = 0; i < buffer.size; ++i) {
-      sum += buffer.base[i];
-    }
-  }
-  return sum;
-}
-
-void ParameterServer2::mergeSegments(BlockSegments* segments) {
-  if (segments->empty()) {
-    return;
-  }
-  std::sort(segments->begin(), segments->end());
-  auto curr = segments->begin();
-  for (auto it = segments->begin(); it != segments->end(); ++it) {
-    if (it->first <= curr->second) {
-      curr->second = std::max(curr->second, it->second);
-    } else {
-      ++curr;
-      *curr = *it;
-    }
-  }
-  ++curr;
-  segments->erase(curr, segments->end());
-}
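mergeSegments() above is a standard merge of sorted, half-open [begin, end) intervals: overlapping or touching segments collapse into one. A standalone walk-through on three assumed segments:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
      // already sorted; [1024, 2048) touches [0, 1024), so they merge
      std::vector<std::pair<uint64_t, uint64_t>> segs = {
          {0, 1024}, {1024, 2048}, {4096, 5120}};
      auto curr = segs.begin();
      for (auto it = segs.begin(); it != segs.end(); ++it) {
        if (it->first <= curr->second) {
          curr->second = std::max(curr->second, it->second);  // extend
        } else {
          ++curr;
          *curr = *it;  // start a new merged segment
        }
      }
      ++curr;
      segs.erase(curr, segs.end());
      std::printf("%zu segments left\n", segs.size());  // 2 remain
      return 0;
    }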
-
-void ParameterServer2::setParameter(const SendParameterRequest& request,
-                                    std::vector<Buffer>& inputBuffers,
-                                    SendParameterResponse* response,
-                                    std::vector<Buffer>* outputBuffers) {
-  (void)response;
-  (void)outputBuffers;
-  LOG(INFO) << "pserver: setParameter";
-  std::lock_guard<RWLock> guard(parameterMutex_);
-
-  int64_t numBlocks = blockIdMap_.size();
-  CHECK_EQ(blockIdMap_.size(), blockOffsetMap_.size());
-  /// total bytes for all the added blocks
-  int64_t totalSize = size_;
-  std::vector<int64_t> offsets;
-  offsets.reserve(request.blocks_size());
-  std::vector<int64_t> blockIds;
-  blockIds.reserve(request.blocks_size());
-  int bufferIndex = 0;
-
-  if (!request.blocks().size()) {
-    LOG(WARNING)
-        << "--ports_num or --ports_num_for_sparse might be too large, "
-        << "or the total dense parameter size or sparse parameter size "
-        << "might be too small; this pserver doesn't store any parameter.";
-    return;
-  }
-
-  for (const auto& block : request.blocks()) {
-    /// block size for the parameter (e.g. 128 for sparse row, 1K for dense)
-    uint64_t blockSize = getParameterConfig(block).parameter_block_size();
-    BlockKey key(block.para_id(), block.block_id());
-    if (inputBuffers.size()) {  // if != PSERVER_UPDATE_MODE_SET_PARAM_ZERO
-      Buffer buffer = inputBuffers[bufferIndex];
-      ++bufferIndex;
-      CHECK_EQ(buffer.size, block.block_size())
-          << "data size is too big:"
-          << " block_size=" << block.block_size()
-          << " data_size=" << buffer.size;
-    }
-
-    /// add a new block
-    if (blockIdMap_.count(key) == 0) {
-      blockOffsetMap_[key] = totalSize;
-      blockIdMap_[key] = numBlocks;
-      ++numBlocks;
-      totalSize += blockSize;
-    }
-    offsets.push_back(blockOffsetMap_[key]);
-    blockIds.push_back(blockIdMap_[key]);
-  }
-
-  size_ = totalSize;
-  LOG(INFO) << "pserver: new cpu vector: size=" << size_;
-  if (!vectors_[PARAMETER_VALUE]) {
-    /// vectors_
-    const auto types = sgdOptimizerGetTypes(config_, true /*inPserver*/);
-    for (const auto type : types) {
-      vectors_[type].reset(new CpuVector(size_));
-      vectors_[type]->zeroMem();
-    }
-
-    blockInfos_.resize(numBlocks);
-    for (auto& info : blockInfos_) {
-      info.lock.reset(new std::mutex());
-    }
-  } else {
-    CHECK_EQ((size_t)size_, vectors_[PARAMETER_VALUE]->getSize())
-        << "Currently adding new blocks is not supported. "
-        << "All blocks must be added in one setParameter call";
-  }
-
-  VectorPtr buf = vectors_[PARAMETER_VALUE];
-  usedSegments_.reserve(offsets.size());
-  /// if offsets is empty, parameter_block_size is too big or there are too
-  /// many nodes.
-  if (offsets.empty()) {
-    LOG(WARNING) << "in setParameter: offsets is empty";
-  }
-  for (size_t i = 0; i < offsets.size(); ++i) {
-    size_t blockId = blockIds[i];
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(request.blocks(i));
-    info.config = &config;
-    info.offset = offsets[i];
-    info.optimizer.reset(sgdOptimizerCreate(
-        config_, config, config.sparse_remote_update(), true /*inPserver*/));
-    if (config.sparse_remote_update()) {
-      size_t width = config.dims(1);
-      CHECK_EQ(config.parameter_block_size(), width)
-          << "block size: " << config.parameter_block_size()
-          << " width: " << width;
-    }
-    info.optimizer->init(1, info.config);
-    usedSegments_.push_back(std::make_pair(
-        offsets[i], offsets[i] + request.blocks(i).block_size()));
-  }
-  mergeSegments(&usedSegments_);
-
-  if (request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM) {
-    /// copy the parameters from the trainer
-    for (size_t i = 0; i < offsets.size(); ++i) {
-      Buffer buffer = inputBuffers[i];
-      real* start = buf->getPoint(offsets[i]);
-      CHECK_LE(offsets[i] + buffer.size, buf->getSize());
-      memcpy(start, buffer.base, sizeof(real) * buffer.size);
-    }
-  } else {
-    CHECK(request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM_ZERO);
-    /// nothing to do; the value vector is already zeroed
-  }
-}
-
-void ParameterServer2::addGradient(const SendParameterRequest& request,
-                                   std::vector<Buffer>& inputBuffers,
-                                   SendParameterResponse* response,
-                                   std::vector<Buffer>* outputBuffers) {
-  VLOG(1) << "pserver: addGradient";
-
-  {
-    ReadLockGuard guard(parameterMutex_);
-    int bufferIndex = 0;
-    for (const auto& block : request.blocks()) {
-      int64_t offset = getBlockOffset(block);
-      CHECK_GE(offset, 0) << "Only existing parameter blocks are allowed:"
-                          << " id=" << block.para_id()
-                          << " block id=" << block.block_id();
-
-      int64_t blockId = getBlockId(block);
-      CHECK_GE(blockId, 0) << "Only existing parameter blocks are allowed:"
-                           << " id=" << block.para_id()
-                           << " block id=" << block.block_id();
-
-      Buffer buffer = inputBuffers[bufferIndex];
++bufferIndex; - - const real* gradientBuffer = buffer.base; - real* gradientSumBuffer = vectors_[PARAMETER_GRADIENT]->getPoint(offset); - - size_t size = buffer.size; - - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - if (config.sparse_remote_update()) { - CHECK_EQ(size, config.parameter_block_size()); - } else { // dense - CHECK_LE(size, config.parameter_block_size()); - } - std::lock_guard guard(*info.lock); - simd::addTo(gradientSumBuffer, gradientBuffer, size); - } - } - if (request.batch_status() == BATCH_FINISH || - request.batch_status() == BATCH_START_AND_FINISH) { - numSamplesProcessed_ += request.num_samples(); - cost_ += request.cost(); - VLOG(1) << "num samples: " << numSamplesProcessed_ - << ", new cost:" << cost_; - - /// notify doOperation gradient ready - gradientReadyBarrier_.wait(); - - /// wait doOperation finish - parameterReadyBarrier_.wait(); - VLOG(1) << "start send back"; - } -} - -bool ParameterServer2::asyncGrdientCommitCheckAndStat( - const SendParameterRequest& request) { - const auto trainerId = request.trainer_id(); - int64_t trainerSteps = asyncTrainerSteps_[trainerId]; - CHECK_GE(asyncUpdateSteps_, trainerSteps) - << " async update steps overflows " - << " trainer id: " << trainerId - << " async update steps in pserver: " << asyncUpdateSteps_ - << " async update steps in request: " << trainerSteps; - - asyncUpdateSteps_++; - bool commitGradient = true; - - int64_t delta = asyncUpdateSteps_ - trainerSteps; - if (delta >= asyncLaggedThreshold_) { - VLOG(1) << "discard Async Update: " - << " trainer id: " << trainerId - << " pserver steps: " << asyncUpdateSteps_ - << " request steps: " << trainerSteps; - asyncLaggedGradientsNum_++; - commitGradient = false; - } - /// stat on lagged steps, to get total discard distribution - if (static_cast(delta) < asyncUpdateStat_.size()) { - asyncUpdateStat_[delta]++; - } else { - asyncUpdateStat_[asyncUpdateStat_.size() - 1]++; - } - /// stat on trainerId and discard, to get trainer condition - if (commitGradient) { - asyncTrainerCommitStat_[trainerId]++; - } else { - asyncTrainerDiscardStat_[trainerId]++; - } - - return commitGradient; -} - -static ThreadLocal> localBlockBitset_; - -void ParameterServer2::asyncSGD(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers) { - int64_t numBlocks = blockIdMap_.size(); - auto& localBlockBitset = *localBlockBitset_; - - if (isSparseServer_) { - if (localBlockBitset.empty()) { - localBlockBitset.resize(numBlocks); - } - localBlockBitset.assign(numBlocks, false); - } - - ReadLockGuard guard(parameterMutex_); - - if (request.send_back_parameter()) { - outputBuffers->reserve(request.blocks_size()); - } - - bool commitGradient = asyncGrdientCommitCheckAndStat(request); - - VectorPtr* vecs = parameter::getThreadLocalBuffer(); - size_t bufferIndex = 0; - for (const auto& block : request.blocks()) { - int64_t offset = getBlockOffset(block); - CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - int64_t blockId = getBlockId(block); - CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - Buffer buffer = inputBuffers[bufferIndex]; - ++bufferIndex; - - size_t size = buffer.size; - - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - - 
std::lock_guard guard(*info.lock); - /// gradients are too obsolete, will be discarded - if (commitGradient) { - info.optimizer->startBatch(numSamplesProcessed_); - - for (const auto type : info.optimizer->getParameterTypes()) { - vecs[type]->subVecFrom(*vectors_[type], offset, size); - } - vecs[PARAMETER_GRADIENT]->subVecFrom(buffer.base, 0, size); - info.optimizer->update(vecs, config, isSparseServer_ ? 0 : -1); - - if (auto callback = info.optimizer->needSpecialTraversal(config)) { - blockTraverse(info, config, offset, size, vecs, callback); - } - info.optimizer->finishBatch(); - } - - if (commitGradient && isSparseServer_) { - localBlockBitset[blockId] = true; - } - - if (!isSparseServer_ && request.send_back_parameter()) { // dense - int type = request.send_back_parameter_type(); - sendBackParameter(block, type, response, &buffer, outputBuffers); - } - } /// foreach block - - asyncTrainerSteps_[request.trainer_id()] = asyncUpdateSteps_; - - if (commitGradient && isSparseServer_) { - /// find blocks that trainer do not request update - for (int64_t blockId = 0; blockId < numBlocks; ++blockId) { - if (localBlockBitset[blockId]) { - continue; - } - - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = *info.config; - size_t size = config.parameter_block_size(); - - std::lock_guard guard(*info.lock); - info.optimizer->startBatch(numSamplesProcessed_); - if (auto callback = info.optimizer->needSpecialTraversal(config)) { - blockTraverse(info, config, info.offset, size, vecs, callback); - } - info.optimizer->finishBatch(); - } - } - - if (commitGradient && (request.batch_status() == BATCH_FINISH || - request.batch_status() == BATCH_START_AND_FINISH)) { - numSamplesProcessed_ += request.num_samples(); - } - - /// show some performance log if needed - if (request.trainer_id() == 0) { - /// batchId_ is approximately equal to "real batchId_" - batchId_++; - } -} - -void ParameterServer2::getParameter(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers) { - (void)inputBuffers; - LOG(INFO) << "pserver: getParameter"; - ReadLockGuard guard(parameterMutex_); - for (const auto& block : request.blocks()) { - int type = request.send_back_parameter_type(); - sendBackParameter(block, type, response, outputBuffers); - } -} - -void ParameterServer2::getParameterSparse(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers) { - (void)inputBuffers; - auto& buffer = *readWriteBuffer_; - size_t numReals = 0; - for (const auto& block : request.blocks()) { - numReals += getParameterConfig(block).dims(1); - } - buffer.resize(numReals); - - VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals; - - ReadLockGuard guard(parameterMutex_); - size_t offset = 0; - for (const auto& block : request.blocks()) { - size_t width = getParameterConfig(block).dims(1); - Buffer buf = {buffer.data() + offset, width}; - int type = request.send_back_parameter_type(); - sendBackParameterSparse(block, type, response, &buf, width, outputBuffers); - offset += width; - } -} - -void ParameterServer2::sendBackParameter(const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - std::vector* outputBuffers) { - ParameterBlock* returnBlock = response->add_blocks(); - returnBlock->set_para_id(block.para_id()); - returnBlock->set_block_id(block.block_id()); - returnBlock->set_begin_pos(block.begin_pos()); - 
returnBlock->set_block_size(block.block_size()); - - int64_t offset = getBlockOffset(block); - CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - - real* valueBuffer = vectors_[parameterType]->getPoint(offset); - outputBuffers->push_back({valueBuffer, (size_t)block.block_size()}); -} - -void ParameterServer2::sendBackParameter(const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - Buffer* buffer, - std::vector* outputBuffers) { - ParameterBlock* returnBlock = response->add_blocks(); - returnBlock->set_para_id(block.para_id()); - returnBlock->set_block_id(block.block_id()); - returnBlock->set_begin_pos(block.begin_pos()); - returnBlock->set_block_size(block.block_size()); - - int64_t offset = getBlockOffset(block); - CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - - size_t size = buffer->size; - real* valueBuffer = vectors_[parameterType]->getPoint(offset); - /// copy to second buffer to avoid to be polluted by other request - memcpy(buffer->base, valueBuffer, sizeof(real) * size); - outputBuffers->push_back({buffer->base, size}); -} - -void ParameterServer2::sendBackParameterSparse( - const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - Buffer* buffer, - size_t width, - std::vector* outputBuffers) { - ParameterBlock* returnBlock = response->add_blocks(); - returnBlock->set_para_id(block.para_id()); - returnBlock->set_block_id(block.block_id()); - returnBlock->set_begin_pos(block.begin_pos()); - returnBlock->set_block_size(block.block_size()); - int64_t offset = getBlockOffset(block); - CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - - real* valueBuffer = vectors_[parameterType]->getPoint(offset); - CHECK_EQ(buffer->size, width); - memcpy(buffer->base, valueBuffer, width * sizeof(real)); - outputBuffers->push_back(*buffer); -} - -void ParameterServer2::readAllBlocks( - MsgReader* msgReader, std::vector* buffers) { - auto& buffer = *readWriteBuffer_; - size_t numBlocks = msgReader->getNumBlocks(); - buffer.resizeWithAlignHints(msgReader->getTotalLength() / sizeof(real), - numBlocks); - std::vector bufs(numBlocks); - buffers->clear(); - buffers->reserve(numBlocks); - buffer.resetAlignAlloc(); - for (size_t i = 0; i < numBlocks; ++i) { - size_t len = msgReader->getBlockLength(i); - CHECK_EQ(len % sizeof(real), (size_t)0); - size_t size = len / sizeof(real); - bufs[i] = buffer.nextBlock(size); - buffers->push_back({(real*)bufs[i], size}); - } - msgReader->readBlocks(bufs); -} - -void ParameterServer2::sendParameter(const SendParameterRequest& request, - std::unique_ptr msgReader, - ProtoResponseCallbackEx callback) { - SendParameterResponse response; - std::vector inputBuffers; - std::vector outputBuffers; - readAllBlocks(msgReader.get(), &inputBuffers); - msgReader.reset(); - - switch (request.update_mode()) { - case PSERVER_UPDATE_MODE_SET_PARAM: - case PSERVER_UPDATE_MODE_SET_PARAM_ZERO: - setParameter(request, inputBuffers, &response, &outputBuffers); - break; - case PSERVER_UPDATE_MODE_GET_PARAM: - getParameter(request, inputBuffers, &response, &outputBuffers); - break; - case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE: - getParameterSparse(request, inputBuffers, &response, &outputBuffers); - break; - case PSERVER_UPDATE_MODE_ASYNC_SGD: - asyncSGD(request, 
inputBuffers, &response, &outputBuffers); - break; - case PSERVER_UPDATE_MODE_ADD_GRADIENT: - addGradient(request, inputBuffers, &response, &outputBuffers); - break; - case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER: - break; - } - switch (request.update_mode()) { - case PSERVER_UPDATE_MODE_ADD_GRADIENT: - (*requestVec_).push_back(request); - (*callbackVec_).push_back(callback); - if (request.batch_status() == BATCH_FINISH || - request.batch_status() == BATCH_START_AND_FINISH) { - for (size_t i = 0; i < (*requestVec_).size(); i++) { - ReadLockGuard guard(parameterMutex_); - SendParameterRequest& request = (*requestVec_)[i]; - SendParameterResponse responseTemp; - - std::vector<iovec> outputIovs; - if (request.send_back_parameter()) { - CHECK(!isSparseServer_); - std::vector<Buffer> outputBuffersTemp; - for (const auto& block : request.blocks()) { - int type = request.send_back_parameter_type(); - sendBackParameter(block, type, &responseTemp, &outputBuffersTemp); - } - outputIovs.reserve(outputBuffersTemp.size()); - for (auto buffer : outputBuffersTemp) { - outputIovs.push_back({buffer.base, buffer.size * sizeof(real)}); - } - } - - ProtoResponseCallbackEx& callbackTemp = (*callbackVec_)[i]; - callbackTemp(responseTemp, outputIovs); - } - (*requestVec_).clear(); - (*callbackVec_).clear(); - } - break; - case PSERVER_UPDATE_MODE_SET_PARAM: - case PSERVER_UPDATE_MODE_SET_PARAM_ZERO: - case PSERVER_UPDATE_MODE_GET_PARAM: - case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE: - case PSERVER_UPDATE_MODE_ASYNC_SGD: - case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER: - std::vector<iovec> outputIovs; - outputIovs.reserve(outputBuffers.size()); - for (auto buffer : outputBuffers) { - outputIovs.push_back({buffer.base, buffer.size * sizeof(real)}); - } - callback(response, outputIovs); - break; - } -} - -template <typename Dtype> -void ParameterServer2::reduceAndSendData(const SendDataRequest& request, - std::unique_ptr<MsgReader>& msgReader, - ProtoResponseCallbackEx& callback) { - SendDataResponse response; - response.set_type(request.type()); - response.set_server_id(serverId_); - - auto sendData = reinterpret_cast<Dtype*>(dataMems_[0].get()->getBuf()); - size_t rawMemSize = dataMems_[0].get()->getSize(); - CHECK_EQ(rawMemSize % sizeof(Dtype), 0U); - size_t dataMemSize = rawMemSize / sizeof(Dtype); - for (size_t i = 1; i < dataMems_.size(); ++i) { - CHECK_EQ(dataMems_[i].get()->getSize(), rawMemSize); - auto data = reinterpret_cast<Dtype*>(dataMems_[i].get()->getBuf()); - for (size_t j = 0; j < dataMemSize; ++j) { - sendData[j] += data[j]; - } - } - std::vector<iovec> outputIovs; - auto block = response.add_blocks(); - outputIovs.push_back({sendData, rawMemSize}); - block->set_total_size(rawMemSize); - block->set_data_size(sizeof(Dtype)); - callback(response, outputIovs); -} - -void ParameterServer2::templateReduceSum(const SendDataRequest& request, - std::unique_ptr<MsgReader>& msgReader, - ProtoResponseCallbackEx& callback) { - const auto& block = request.blocks(0); - switch (block.data_type()) { - case TRANS_FLOAT: - reduceAndSendData<float>(request, msgReader, callback); - break; - case TRANS_DOUBLE: - reduceAndSendData<double>(request, msgReader, callback); - break; - case TRANS_INT32: - reduceAndSendData<int32_t>(request, msgReader, callback); - break; - case TRANS_UINT32_T: - reduceAndSendData<uint32_t>(request, msgReader, callback); - break; - case TRANS_INT64_T: - reduceAndSendData<int64_t>(request, msgReader, callback); - break; - case TRANS_UINT64_T: - reduceAndSendData<uint64_t>(request, msgReader, callback); - break; - default: - LOG(FATAL) << "not supported"; - break; - } -} - -void ParameterServer2::sendData(const SendDataRequest&
request, - std::unique_ptr msgReader, - ProtoResponseCallbackEx callback) { - SendDataResponse response; - response.set_type(request.type()); - response.set_server_id(serverId_); - - switch (request.update_mode()) { - case DATA_UPDATE_MODE_SET_OWN: { - CHECK_EQ(msgReader->getNumBlocks(), (size_t)(request.blocks_size())); - size_t totalLen = msgReader->getTotalLength(); - if (totalLen > 0) { - CHECK_EQ(msgReader->getNumBlocks(), 1U) - << "Only one block currently support now!"; - const auto& block = request.blocks(0); - if (0 == dataSize_) { - dataSize_ = block.data_size(); - } else { - CHECK_EQ(dataSize_, block.data_size()); - } - int64_t serverId = request.server_id(); - if (serverId_ < 0) { - serverId_ = serverId; - } else { - CHECK_EQ(serverId_, serverId); - } - int64_t clientId = request.client_id(); - dataMems_[clientId] = std::make_shared(totalLen); - CHECK_EQ(totalLen % sizeof(block.data_size()), 0U); - msgReader->readNextBlock(dataMems_[clientId].get()->getBuf()); - } - msgReader.reset(); - std::vector outputIovs; - callback(response, outputIovs); - break; - } - case DATA_UPDATE_MODE_GET_ALL: { - /// Currently only support DATA_REDUCE_SUM - /// And their Operations are just add - CHECK(DATA_REDUCE_SUM == request.type()); - templateReduceSum(request, msgReader, callback); - break; - } - default: { LOG(FATAL) << "not supported"; } - } -} - -void ParameterServer2::clearUnusedSegments(CpuVector* vec) { - real* data = vec->getData(); - if (usedSegments_.empty()) { - return; - } - memset(data, 0, sizeof(real) * usedSegments_[0].first); - memset(data + usedSegments_.back().second, - 0, - sizeof(real) * (size_ - usedSegments_.back().second)); - size_t n = size_ - usedSegments_.back().second; - - for (size_t i = 1; i < usedSegments_.size(); ++i) { - memset( - data + usedSegments_[i - 1].second, - 0, - sizeof(real) * (usedSegments_[i].first - usedSegments_[i - 1].second)); - n += usedSegments_[i].first - usedSegments_[i - 1].second; - } -} - -void ParameterServer2::parallelExecForEachBlock(ExecFunc func) { - SyncThreadPool::execHelper( - syncThreadPool_.get(), [&](int tid, size_t numThreads) { - int64_t numBlocks = blockIdMap_.size(); - VectorPtr* vecs = parameter::getThreadLocalBuffer(); - for (int64_t blockId = tid; blockId < numBlocks; - blockId += numThreads) { - func(blockId, vecs); - } - }); -} - -void ParameterServer2::blockTraverse( - BlockInfo& info, - const ParameterConfig& config, - int64_t offset, - size_t size, - const VectorPtr vecs[], - const ParameterOptimizer::TraverseCallback& callback) { - /// setup sub bufs - for (const auto type : info.optimizer->getParameterTypes()) { - vecs[type]->subVecFrom(*vectors_[type], offset, size); - } - callback(vecs, config, config.sparse_remote_update() ? 0 : -1LU); -} - -void ParameterServer2::op_SGD(const Operation& operation, - OperationResult* result) { - (void)operation; - (void)result; - - if (allClientPassFinish_) { - /// when all clients signal pass finished, the update - /// is empty. - return; - } - - { - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - int64_t offset = info.offset; - size_t size = config.parameter_block_size(); - - info.optimizer->startBatch(numSamplesProcessed_); - - for (const auto type : info.optimizer->getParameterTypes()) { - vecs[type]->subVecFrom(*vectors_[type], offset, size); - } - info.optimizer->update( - vecs, config, config.sparse_remote_update() ? 
0 : -1LU); - vecs[PARAMETER_GRADIENT]->zeroMem(); - - if (auto callback = info.optimizer->needSpecialTraversal(config)) { - blockTraverse(info, config, offset, size, vecs, callback); - } - info.optimizer->finishBatch(); - }); - } - - batchId_++; -} - -void ParameterServer2::op_start_pass(const Operation& operation, - OperationResult* result) { - (void)operation; - (void)result; - - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { - BlockInfo& info = blockInfos_[blockId]; - info.optimizer->startPass(); - }); -} - -void ParameterServer2::op_finish_pass(const Operation& operation, - OperationResult* result) { - (void)operation; - (void)result; - - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - size_t size = config.parameter_block_size(); - - /// catch up with - if (auto callback = info.optimizer->startCatchUpWith()) { - blockTraverse(info, config, info.offset, size, vecs, callback); - info.optimizer->finishCatchUpWith(); - } - - /// finish pass - info.optimizer->finishPass(); - }); - batchId_ = 0; -} - -void ParameterServer2::op_apply(const Operation& operation, - OperationResult* result) { - (void)operation; - (void)result; - - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - int64_t offset = info.offset; - size_t size = config.parameter_block_size(); - - // catch up with - if (auto callback = info.optimizer->startCatchUpWith()) { - blockTraverse(info, config, offset, size, vecs, callback); - info.optimizer->finishCatchUpWith(); - } - - // apply to PARAMETER_APPLY - if (auto callback = info.optimizer->apply()) { - blockTraverse(info, config, offset, size, vecs, callback); - } - }); -} - -void ParameterServer2::op_randomize(const Operation& operation, - OperationResult* result) { - LOG(INFO) << "ParameterServer2::op_randomize: serverId=" << serverId_; - - CpuVector& valueVec = *vectors_[PARAMETER_VALUE]; - - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - size_t size = config.parameter_block_size(); - - vecs[PARAMETER_VALUE]->subVecFrom(valueVec, info.offset, size); - Parameter::randomize(vecs[PARAMETER_VALUE], config); - }); -} - -void ParameterServer2::loadValueVector(const LoadValueRequest& request, - ProtoResponseCallback callback) { - LoadValueResponse response; - LOG(INFO) << "ParameterServer2::loadValueVector: serverId=" << serverId_; - - constexpr int kBufLen = 100; - char buf[kBufLen]; - snprintf(buf, kBufLen, "/pserver.%04d", static_cast(serverId_)); - std::string filename = request.dir_name() + buf; - - std::ifstream fs(filename, std::ios_base::binary); - CHECK(fs) << "Fail to open " << filename; - - CpuVector& vec = *vectors_[PARAMETER_VALUE]; - Parameter::Header header; - CHECK(fs.read(reinterpret_cast(&header), sizeof(header))) - << "Fail to read parameters in pserver"; - CHECK(Parameter::isHeaderFormatSupported(header.format)) - << "Incorrect format version: " << header.format; - CHECK_EQ(header.size, (size_t)size_) - << "The size (" << header.size << ") in the file does not match the size " - << "(" << size_ << ") of the pserver: " << serverId_; - CHECK_EQ(header.valueSize, sizeof(real)) << "Unsupported valueSize " - << header.valueSize; - 
CHECK(fs.read(reinterpret_cast(vec.getData()), - header.size * sizeof(real))); - - callback(response); -} - -void ParameterServer2::saveValueVector(const SaveValueRequest& request, - ProtoResponseCallback callback) { - SaveValueResponse response; - LOG(INFO) << "ParameterServer2::SaveValueVector: serverId=" << serverId_; - - mkDir(request.dir_name().c_str()); - - constexpr int kBufLen = 100; - char buf[kBufLen]; - snprintf(buf, kBufLen, "/pserver.%04d", static_cast(serverId_)); - std::string filename = request.dir_name() + buf; - - std::ofstream fs(filename, std::ios_base::binary); - CHECK(fs) << "Fail to open " << filename; - - CpuVector& vec = vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY] - : *vectors_[PARAMETER_VALUE]; - Parameter::Header header; - // TODO(TJ): save param headerFormat_ - header.format = PARAM_FORMAT_ORIGINAL; - header.valueSize = sizeof(real); - header.size = size_; - - CHECK_EQ(header.size, vec.getSize()); - - CHECK(fs.write(reinterpret_cast(&header), sizeof(header))) - << "Fail to write parameter in pserver: " << serverId_; - - CHECK(fs.write(reinterpret_cast(vec.getData()), - header.size * sizeof(real))) - << "Fail to write parameter in pserver: " << serverId_; - - callback(response); -} - -void ParameterServer2::op_RESET(const Operation& operation, - OperationResult* result) { - (void)result; - CpuVector* u = vectors_[operation.pvectors(0)].get(); - u->reset(operation.scalars(0)); - clearUnusedSegments(u); -} - -void ParameterServer2::op_utv(const Operation& operation, - OperationResult* result) { - real* u = vectors_[operation.pvectors(0)]->getData(); - real* v = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - double sum = 0; - for (int64_t i = 0; i < size; ++i) { - sum += (double)u[i] * (double)v[i]; - } - result->add_scalars(sum); -} - -void ParameterServer2::op_au_bv(const Operation& operation, - OperationResult* result) { - (void)result; - real* u = vectors_[operation.pvectors(0)]->getData(); - real* v = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - real a = operation.scalars(0); - real b = operation.scalars(1); - for (int64_t i = 0; i < size; ++i) { - v[i] = a * u[i] + b * v[i]; - } -} - -void ParameterServer2::op_COPY(const Operation& operation, - OperationResult* result) { - (void)result; - real* u = vectors_[operation.pvectors(0)]->getData(); - real* v = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - for (int64_t i = 0; i < size; ++i) { - v[i] = u[i]; - } -} - -void ParameterServer2::op_au(const Operation& operation, - OperationResult* result) { - (void)result; - real* u = vectors_[operation.pvectors(0)]->getData(); - int64_t size = size_; - real a = operation.scalars(0); - for (int64_t i = 0; i < size; ++i) { - u[i] *= a; - } -} - -void ParameterServer2::op_au_bv_cw(const Operation& operation, - OperationResult* result) { - (void)result; - real* u = vectors_[operation.pvectors(0)]->getData(); - real* v = vectors_[operation.pvectors(1)]->getData(); - real* w = vectors_[operation.pvectors(2)]->getData(); - int64_t size = size_; - real a = operation.scalars(0); - real b = operation.scalars(1); - real c = operation.scalars(2); - for (int64_t i = 0; i < size; ++i) { - w[i] = a * u[i] + b * v[i] + c * w[i]; - } -} - -void ParameterServer2::op_make_steepest_desc_dir(const Operation& operation, - OperationResult* result) { - (void)result; - real* dir = vectors_[operation.pvectors(0)]->getData(); - real* grad = vectors_[operation.pvectors(1)]->getData(); - real* x = 
vectors_[operation.pvectors(2)]->getData(); - int64_t size = size_; - real l1weight = operation.scalars(0); - for (int64_t i = 0; i < size; ++i) { - if (x[i] < 0) { - dir[i] = -grad[i] + l1weight; - } else if (x[i] > 0) { - dir[i] = -grad[i] - l1weight; - } else { - if (grad[i] < -l1weight) { - dir[i] = -grad[i] - l1weight; - } else if (grad[i] > l1weight) { - dir[i] = -grad[i] + l1weight; - } else { - dir[i] = 0; - } - } - } -} - -void ParameterServer2::op_fix_dir_signs(const Operation& operation, - OperationResult* result) { - (void)result; - real* dir = vectors_[operation.pvectors(0)]->getData(); - real* steepestDescDir = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - for (int64_t i = 0; i < size; ++i) { - if (dir[i] * steepestDescDir[i] <= 0) { - dir[i] = 0; - } - } -} - -void ParameterServer2::op_fix_omega_signs(const Operation& operation, - OperationResult* result) { - (void)result; - real* x = vectors_[operation.pvectors(0)]->getData(); - real* newx = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - for (int64_t i = 0; i < size; ++i) { - if (x[i] * newx[i] < 0) { - newx[i] = 0; - } - } -} - -void ParameterServer2::op_dir_deriv(const Operation& operation, - OperationResult* result) { - real* dir = vectors_[operation.pvectors(0)]->getData(); - real* grad = vectors_[operation.pvectors(1)]->getData(); - real* x = vectors_[operation.pvectors(2)]->getData(); - int64_t size = size_; - real l1weight = operation.scalars(0); - double sum = 0; - for (int64_t i = 0; i < size; ++i) { - if (dir[i] != 0) { - if (x[i] < 0) { - sum += dir[i] * (grad[i] - l1weight); - } else if (x[i] > 0) { - sum += dir[i] * (grad[i] + l1weight); - } else if (dir[i] < 0) { - sum += dir[i] * (grad[i] - l1weight); - } else if (dir[i] > 0) { - sum += dir[i] * (grad[i] + l1weight); - } - } - } - result->add_scalars(sum); -} - -void ParameterServer2::op_cost(const Operation& operation, - OperationResult* result) { - real* x = vectors_[operation.pvectors(0)]->getData(); - real* newgrad = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - real l1weight = operation.scalars(0); - real l2weight = operation.scalars(1); - double cost_real = cost_ / mpiSize_; - double sum_weight_l1 = 0; - double sum_weight_l2 = 0; - for (int64_t i = 0; i < size; ++i) { - sum_weight_l1 += std::abs(x[i]); - sum_weight_l2 += x[i] * x[i]; - newgrad[i] += 2.0 * l2weight * x[i]; - } - cost_real += l1weight * sum_weight_l1 + l2weight * sum_weight_l2; - result->add_scalars(cost_real); -} - -ParameterServer2::OperatorFunction ParameterServer2::opFuncs[] = { - nullptr, // PSERVER_OP_utu = 0; - &ParameterServer2::op_utv, // PSERVER_OP_utv = 1; - &ParameterServer2::op_au, // PSERVER_OP_au = 2; - &ParameterServer2::op_au_bv, // PSERVER_OP_au_bv = 3; - nullptr, // PSERVER_OP_aAx_bu = 4; - &ParameterServer2::op_SGD, // PSERVER_OP_SGD = 5; - &ParameterServer2::op_RESET, // PSERVER_OP_RESET = 6; - &ParameterServer2::op_COPY, // PSERVER_OP_COPY = 7; - &ParameterServer2::op_au_bv_cw, // PSERVER_OP_au_bv_cw = 8; - &ParameterServer2::op_make_steepest_desc_dir, - /// PSERVER_OP_MAKE_STEEPEST_DESC_DIR = 9; - &ParameterServer2::op_fix_dir_signs, // PSERVER_OP_FIX_SIGNS = 10; - &ParameterServer2::op_dir_deriv, // PSERVER_OP_DIR_DERIV = 11; - &ParameterServer2::op_fix_omega_signs, // PSERVER_OP_FIX_OMEGA_SIGNS = 12; - &ParameterServer2::op_cost, // PSERVER_OP_COST = 13 - &ParameterServer2::op_start_pass, // PSERVER_OP_START_PASS = 14 - &ParameterServer2::op_finish_pass, // PSERVER_OP_FINISH_PASS = 15 - 
&ParameterServer2::op_randomize, // PSERVER_OP_RANDOMIZE = 16 - &ParameterServer2::op_apply, // PSERVER_OP_APPLY = 17 -}; - -void ParameterServer2::doOperation(const DoOperationRequest& request, - ProtoResponseCallback callback) { - if (request.wait_for_gradient()) { - /// wait for the gradient update - gradientReadyBarrier_.wait(); - allClientPassFinish_ = numPassFinishClients_ == FLAGS_num_gradient_servers; - } - - DoOperationResponse response; - response.set_pass_finish(allClientPassFinish_); - - for (const auto& op : request.operations()) { - OperationResult* opResult = response.add_results(); - if (op.operation() >= ARRAYSIZE(opFuncs)) { - LOG(ERROR) << "Unknown operation " << op.operation(); - response.set_return_message(kRetMsgUnknownOperation); - } - OperatorFunction opFunc = opFuncs[op.operation()]; - if (!opFunc) { - LOG(ERROR) << "Operation not implemented: " << op.operation(); - response.set_return_message(kRetMsgUnknownOperation); - } - (this->*opFunc)(op, opResult); - } - - if (request.send_back_parameter()) { - /// clear the current cost - cost_ = 0; - - if (allClientPassFinish_ && request.release_pass()) { - /// This signals that all clients finished one pass, so waitPassFinish() - /// will stop waiting. - numPassFinishClients_ = 0; - } - - /// notify addGradient() to send back parameter - parameterReadyBarrier_.wait(); - } - callback(response); -} - -void ParameterServer2::waitPassStart(const WaitPassStartRequest& request, - ProtoResponseCallback callback) { - passBarrier_.wait(); - callback(WaitPassStartResponse()); -} - -void ParameterServer2::waitPassFinish(const WaitPassFinishRequest& request, - ProtoResponseCallback callback) { - numPassFinishClients_ += 1; - - while (numPassFinishClients_ != 0) { - /// notify doOperation gradient ready - gradientReadyBarrier_.wait(); - /// wait doOperation finish - parameterReadyBarrier_.wait(); - } - - callback(WaitPassFinishResponse()); -} - -void ParameterServer2::synchronize(const SynchronizeRequest& request, - ProtoResponseCallback callback) { - synchronizeBarriers_[request.sync_object_id()]->wait(); - dataSize_ = 0; - callback(SynchronizeResponse()); -} - -void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request, - ProtoResponseCallback callback) { - synchronizeBarriers_[request.sync_object_id()]->wait(); - callback(SynchronizeResponse()); - - if (request.trainer_id() == 0) { - batchId_ = 0; - } -} - -void ParameterServer2::createVector(const CreateVectorRequest& request, - ProtoResponseCallback callback) { - (void)request; - CreateVectorResponse response; - LOG(INFO) << "ParameterServer2::createVector: size=" << size_; - CpuVectorPtr vec = std::make_shared<CpuVector>(size_); - int64_t handle = -1; - { - std::lock_guard<RWLock> guard(parameterMutex_); - handle = vectors_.size(); - vectors_.push_back(vec); - } - response.set_handle(handle); - callback(response); -} - -void ParameterServer2::releaseVector(const ReleaseVectorRequest& request, - ProtoResponseCallback callback) { - ReleaseVectorResponse response; - CpuVectorPtr vec; - { - std::lock_guard<RWLock> guard(parameterMutex_); - vec.swap(vectors_[request.handle()]); - } - callback(response); -} - -void ParameterServer2::createMatrix(const CreateMatrixRequest& request, - ProtoResponseCallback callback) { - CreateMatrixResponse response; - /// We need to create a column-major matrix of size_ * num_cols. - /// Matrix is row major, so it needs to be transposed when used. - CpuMatrixPtr mat = std::make_shared<CpuMatrix>(request.num_cols(), size_); - int64_t handle = -1; - { - std::lock_guard<RWLock> guard(parameterMutex_); - handle = matrices_.size(); - matrices_.push_back(mat); - } - response.set_handle(handle); - callback(response); -}
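createMatrix above stores the logical column-major size_ x num_cols matrix as a row-major num_cols x size_ buffer, so element (r, c) of the column-major view lives at linear offset c * size_ + r, and each logical column is one contiguous run. A minimal standalone sketch of that index mapping (plain arrays, not Paddle's CpuMatrix):

```cpp
#include <cassert>
#include <vector>

int main() {
  const size_t size = 4;     // parameter dimension (rows of the logical view)
  const size_t numCols = 3;  // columns requested by the client

  // Column-major (size x numCols) stored as row-major (numCols x size):
  // each "row" of the buffer is one whole logical column, contiguous.
  std::vector<double> buf(numCols * size, 0.0);

  auto at = [&](size_t r, size_t c) -> double& { return buf[c * size + r]; };

  at(2, 1) = 42.0;
  // Offset of (r=2, c=1) is c*size + r = 1*4 + 2 = 6.
  assert(buf[6] == 42.0);
  return 0;
}
```

Writing one column therefore touches a single contiguous block, which is why the transposed layout is convenient for per-parameter updates.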
-void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request, - ProtoResponseCallback callback) { - ReleaseMatrixResponse response; - CpuMatrixPtr mat; - { - std::lock_guard<RWLock> guard(parameterMutex_); - mat.swap(matrices_[request.handle()]); - } - callback(response); -} - -} // namespace paddle diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h deleted file mode 100644 index 0b8ef5c170c01ec8a5d53f01db9888f82ca68eec..0000000000000000000000000000000000000000 --- a/paddle/pserver/ParameterServer2.h +++ /dev/null @@ -1,696 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/ParameterOptimizer.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/ThreadLocal.h" - -#include "ParameterService.pb.h" - -#include "ProtoServer.h" - -DECLARE_int32(port); - -namespace paddle { - -// @TODO(yanfei): -// if armed with high density computation resource per node, the pserver could -// also utilize the GPU to reduce overhead. if this mechanism is used, it could -// pipeline network receiving and GPU computation to reduce the network -// overhead even further. the pipeline could help to accelerate BIG model -// training. -// @TODO:(yanfei) -// for cpu-only and low-gpu machines, the time consumed by forward and backward -// could be larger than the optimization at the pserver. However, if armed with -// lots of gpus per node and if the model size is large enough that limited cpu -// computation causes big optimization latency, the GPU may be required by the -// pserver. - -/** - * Client interface for the parameter server - * - * it implements several RPC APIs for remote parameter client usage. - * for sync-sgd, the client needs one controller thread to build connections - * to all pservers; these controller connections do barrier - * synchronization, while the other connections are used for transferring - * data. each data connection uses block-based fine-grained synchronization - * to gain better scalability. Merging gradients from different trainers - * is executed concurrently in block units, so that some network - * overhead is hidden by the gradient merging. - * for async-sgd, the difference is that the pserver does the optimization - * immediately once the gradients are ready, so the pserver needs to - * prepare a separate buffer storing the values sent back to the trainer - * to prevent them from being polluted. - */
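The sync-sgd rendezvous described above (trainers push gradients, the controller runs the optimization, then parameters are sent back) reduces to a pair of reusable barriers, as seen in addGradient() and doOperation() earlier in the .cpp. A simplified stand-alone model of that handshake with one trainer and one controller thread, using a minimal counting barrier in place of Paddle's ThreadBarrier (names here are illustrative):

```cpp
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

// Minimal reusable 2-party barrier (stand-in for paddle's ThreadBarrier).
class Barrier {
 public:
  explicit Barrier(int n) : n_(n) {}
  void wait() {
    std::unique_lock<std::mutex> lk(m_);
    int gen = gen_;
    if (++count_ == n_) {   // last arriver releases everyone
      count_ = 0;
      ++gen_;
      cv_.notify_all();
    } else {
      cv_.wait(lk, [&] { return gen != gen_; });
    }
  }

 private:
  std::mutex m_;
  std::condition_variable cv_;
  int n_, count_ = 0, gen_ = 0;
};

int main() {
  Barrier gradientReady(2), parameterReady(2);

  std::thread trainer([&] {
    std::puts("trainer: gradient merged");   // addGradient's role
    gradientReady.wait();                    // hand off to the controller
    parameterReady.wait();                   // wait for the optimization
    std::puts("trainer: sending parameters back");
  });
  std::thread controller([&] {
    gradientReady.wait();                    // wait until gradients arrived
    std::puts("controller: op_SGD update");  // doOperation's role
    parameterReady.wait();                   // release the send-back
  });
  trainer.join();
  controller.join();
  return 0;
}
```

The two barriers guarantee the optimization never reads a half-merged gradient and the send-back never reads half-updated parameters.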
-class ParameterServer2 : public ProtoServer { - protected: - /// parameter_ mutex. - RWLock parameterMutex_; - - typedef std::pair<size_t, size_t> BlockKey; - struct BlockKeyHash { - size_t operator()(const BlockKey& key) const { - return std::hash<size_t>()(key.first) + key.second; - } - }; - - // TODO(yanfei): - // if the index data structure were based on parameters instead of blocks, - // the lookup performance could be better. In addition, the block memory - // access almost exhibits good locality, so the index data structure and - // block data structure can be refined further, especially if a gpu is - // used for the pserver. - /** - * all parameters are stored in CpuVector with a blockMap_ data structure - * to index the block data required by requests. - */ - typedef std::unordered_map<BlockKey, int64_t, BlockKeyHash> BlockMap; - /// <(para, block), global offset(byte) in all parameters> - BlockMap blockOffsetMap_; - /// <(para, block), global idx [0, nBlocksInAllParameters]> - BlockMap blockIdMap_; - - std::vector<CpuVectorPtr> vectors_; - std::vector<CpuMatrixPtr> matrices_; - std::vector<CpuMemHandlePtr> dataMems_; - - // TODO(yanfei): - // if the sparse_remote_update() flag were stored in the request instead - // of read from configMap_, and the config stored within a new block-wise - // overview data structure, the config mapping and block mapping - // could be unified in a single clean data structure: use para_id - // to index parameters, use offset to index a block within a parameter, - // and keep the two indices in one place. - /** - * mapping between parameter and config - * different parameters allow different configs, such as decay_rate. - * for each request, it needs to read the config for gradient - * aggregation and optimization. - */ - std::unordered_map<size_t, ParameterConfig> configMap_; - - /** - * to parallelize the multi-thread and multi-connection - * computation at the pserver, it uses block units to reduce - * the contention for computation, and further uses block-level - * optimizer control for each block, for the special - * reasons annotated below. - */ - struct BlockInfo { - const ParameterConfig* config; - std::unique_ptr<std::mutex> lock; - /// global offset for all parameters - uint64_t offset; - /** - * - * Async sgd in the pserver is very different from sync sgd. - * Each trainer follows startBatch, update*, finishBatch as in - * sync sgd, but all these actions are executed almost - * simultaneously by multiple cores and threads, so async - * sgd optimization is in reality block based, and per-block - * optimization is necessary indeed. In addition, - * per-block optimization is also preferred for performance - * with multiple threads. - */ - std::unique_ptr<ParameterOptimizer> optimizer; - }; - std::vector<BlockInfo> blockInfos_; - - typedef std::vector<std::pair<size_t, size_t>> BlockSegments; - /// Because some blocks might not be fully used, we keep a - /// record of which segments are used. - BlockSegments usedSegments_; - - /// records the pserver status; all statuses are defined in - /// ParameterService.pb - PServerStatus status_; - /// records all samples processed, which can be used by the optimizer - std::atomic<int64_t> numSamplesProcessed_; - double cost_; - int mpiSize_; - int dataSize_; - /// configuration for the current parameter optimizer - OptimizationConfig config_;
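usedSegments_ above is kept sorted and non-overlapping by mergeSegments (defined earlier in the .cpp). A free-standing illustration of that invariant, using a plain vector of [begin, end) pairs (same algorithm, simplified):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

using Segments = std::vector<std::pair<size_t, size_t>>;

// Same merge scheme as ParameterServer2::mergeSegments: sort, then fold
// every range that starts inside the current one into it.
void merge(Segments* segs) {
  if (segs->empty()) return;
  std::sort(segs->begin(), segs->end());
  auto curr = segs->begin();
  for (auto it = segs->begin(); it != segs->end(); ++it) {
    if (it->first <= curr->second) {
      curr->second = std::max(curr->second, it->second);  // extend
    } else {
      *++curr = *it;  // start a new disjoint segment
    }
  }
  segs->erase(++curr, segs->end());
}

int main() {
  Segments s = {{8, 10}, {0, 4}, {2, 6}};
  merge(&s);
  assert((s == Segments{{0, 6}, {8, 10}}));  // overlaps collapsed
  return 0;
}
```

Keeping the list merged lets clearUnusedSegments() zero exactly the gaps between used ranges.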
- /** - * The ReadWriteBuffer is based on std::vector, but aligned for avx/sse - * compute, and it adds some helper methods to allocate memory-aligned - * blocks. - * - * @param T type of the elements. - * @param AlignBytes the alignment in bytes for allocated blocks. - */ - template <typename T, size_t AlignBytes> - class ReadWriteBuffer - : public std::vector<T, AlignedAllocator<T, AlignBytes>> { - public: - static_assert(sizeof(T) % AlignBytes == 0 || AlignBytes % sizeof(T) == 0, - "Type T must be able to be aligned."); - - /** - * @brief IsTLargerThanAlign is a compile-time constant indicating - * whether type T is at least as large as the alignment. - */ - constexpr static bool IsTLargerThanAlign = sizeof(T) >= AlignBytes; - - static_assert(std::is_pod<T>::value, "T must be POD type."); - - /** - * @brief if AlignBytes > sizeof(T), calculate how many elements - * can be stored in AlignBytes. - */ - constexpr static size_t AlignElementCount = AlignBytes / sizeof(T); - - static_assert(AlignElementCount == - (AlignElementCount & -AlignElementCount) || - AlignBytes > sizeof(T), - "AlignElementCount should be a power of 2"); - - /** - * @brief Resize the buffer, given the count of blocks that will be - * allocated. Each block will be memory-aligned to AlignBytes. - * @param size The element count over all blocks. - * @param alignBlockCount The count of blocks that will be allocated. - */ - void resizeWithAlignHints(size_t size, size_t alignBlockCount = 1) { - if (IsTLargerThanAlign) { //! So, each element is memory aligned. - this->resize(size); - } else { - //! at most, we need this many elements in the buffer to make sure - //! each block is aligned. - this->resize(size + alignBlockCount * (AlignElementCount - 1)); - } - } - - /** - * @brief reset the aligned block allocation. - */ - void resetAlignAlloc() { this->curOffset_ = 0; } - - /** - * @brief get the next aligned block address. - * @param blockSize is the element count in each block. - * @return Aligned block address. - */ - T* nextBlock(size_t blockSize) { - T* r = &this->operator[](curOffset_); - curOffset_ += blockSize; - - if (!IsTLargerThanAlign) { - curOffset_ = - (curOffset_ + AlignElementCount - 1) & ~(AlignElementCount - 1); - } - return r; - } - - private: - size_t curOffset_; - }; - - /// buffers the data from the network for further processing to - /// reduce redundant memory allocation. - ThreadLocal> readWriteBuffer_;
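readWriteBuffer_ above carves one thread-local allocation into per-block sub-buffers. Assuming real is float and a 32-byte alignment (so AlignElementCount == 8; both values are illustrative), nextBlock returns the current offset and then rounds the running offset up to the next multiple of 8 elements. A standalone sketch of just that arithmetic:

```cpp
#include <cassert>
#include <cstddef>

int main() {
  // With T = float and AlignBytes = 32, AlignElementCount is 32/4 = 8.
  const size_t kAlignElems = 8;
  size_t curOffset = 0;  // what resetAlignAlloc() would zero

  // nextBlock(blockSize): return the current offset, then round it up so
  // the next block starts on an element-count multiple of the alignment.
  auto nextBlock = [&](size_t blockSize) {
    size_t start = curOffset;
    curOffset = (curOffset + blockSize + kAlignElems - 1) & ~(kAlignElems - 1);
    return start;
  };

  assert(nextBlock(5) == 0);   // first block at offset 0
  assert(nextBlock(8) == 8);   // 5 was rounded up to 8, not left at 5
  assert(nextBlock(3) == 16);  // 8 + 8 = 16, already aligned
  return 0;
}
```

This padding of up to AlignElementCount - 1 elements per block is exactly what resizeWithAlignHints over-allocates for.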
- - /// size of the parameter - int64_t size_; - - /// for synchronized training, check details in addGradient() - /// and doOperation() - ThreadBarrier gradientReadyBarrier_; - ThreadBarrier parameterReadyBarrier_; - ThreadBarrier passBarrier_; - ThreadLocal<std::vector<SendParameterRequest>> requestVec_; - ThreadLocal<std::vector<ProtoResponseCallbackEx>> callbackVec_; - - std::atomic<int> numPassFinishClients_; - bool allClientPassFinish_; - - std::vector<std::unique_ptr<ThreadBarrier>> synchronizeBarriers_; - std::atomic<int64_t> serverId_; - - /** - * - * for lagged async gradient commit control in async sgd. - * discard lagged gradients from nodes that are too slow, whose gradients - * exhibit bad quality. - * Algorithm: - * pserver: - * 1. initially asyncUpdateSteps_ = 0, asyncTrainerSteps_[N] = 0. - * asyncUpdateSteps_ means - * the version of the parameter value. - * 2. when a pull arrives, record asyncUpdateSteps_ into - * asyncTrainerSteps_[trainer_id] - * 3. when a push arrives, compare asyncUpdateSteps_ with - * asyncTrainerSteps_[trainer_id]; - * if the delta >= threshold, discard the current gradient, else - * commit the gradient. - * 4. reset asyncUpdateSteps_ and asyncTrainerSteps_[N] when a pass - * finishes - * Note: - * it can not strictly discard all lagged gradients in some special - * conditions; part of the gradients could still be committed if - * ConcurrentRemoteParameterUpdater is used. - * this algorithm is implemented in asyncSGD() - */ - int64_t asyncLaggedThreshold_; - std::atomic<int64_t> asyncUpdateSteps_; - std::vector<int64_t> asyncTrainerSteps_; - size_t asyncLaggedGradientsNum_; - /// stats over all async updates - std::vector<size_t> asyncUpdateStat_; - /// stats per trainer_id - std::vector<size_t> asyncTrainerDiscardStat_; - /// stats per trainer_id - std::vector<size_t> asyncTrainerCommitStat_;
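Plugging numbers into the scheme above: with, say, --num_gradient_servers=4 and an async_lagged_grad_discard_ratio of 1.5 (both values are illustrative), the threshold is 6 steps. A toy trace of the commit check for a single trainer, mirroring the logic of asyncGrdientCommitCheckAndStat in the .cpp:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const int64_t numGradientServers = 4;
  const double laggedRatio = 1.5;  // async_lagged_grad_discard_ratio
  const int64_t threshold =
      static_cast<int64_t>(numGradientServers * laggedRatio);
  assert(threshold == 6);

  int64_t serverSteps = 0;   // asyncUpdateSteps_ (parameter "version")
  int64_t trainerSteps = 0;  // asyncTrainerSteps_[id], recorded at pull time

  // Six updates were committed by *other* trainers since this trainer
  // last pulled parameters.
  serverSteps += 6;

  // This trainer's push arrives: one step is charged, then the lag checked.
  ++serverSteps;
  bool commit = (serverSteps - trainerSteps) < threshold;
  assert(!commit);  // lag of 7 >= 6, so the gradient is discarded
  return 0;
}
```

A trainer that pulls frequently keeps its recorded step close to the server's and always commits; only chronically slow trainers get discarded.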
- - /// only used by the controller and other control cmds from trainer number 0 - std::unique_ptr<SyncThreadPool> syncThreadPool_; - - /// pserver for sparse remote update parameters - bool isSparseServer_; - - /// barrier performance tuning, required by sync-sgd - std::atomic<int64_t> batchId_; - - public: - struct Buffer { - real* base; - size_t size; - }; - - protected: - /// async gradient commit control - bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request); - - public: - /// disable the default parameter for overloading - /// @rdmaCpu: the id of the cpu core hosting the RDMA server (0-N); - /// -1 means using TCP transport instead of RDMA - ParameterServer2(const std::string& addr, int port, int rdmaCpu = -1); - - ~ParameterServer2() {} - - static const std::string kRetMsgInvalidMatrixHandle; - static const std::string kRetMsgInvalidVectorHandle; - static const std::string kRetMsgUnknownOperation; - - /// service functions - template <typename Dtype> - void reduceAndSendData(const SendDataRequest& request, - std::unique_ptr<MsgReader>& msgReader, - ProtoResponseCallbackEx& callback); - - void templateReduceSum(const SendDataRequest& request, - std::unique_ptr<MsgReader>& msgReader, - ProtoResponseCallbackEx& callback); - - /** - * @brief framework for sending parameters - * - * @note different parameter data types can be sent to the pserver. - * in most cases, this api is used to send gradients from - * a trainer to the pserver. - * it can also be used to retrieve parameters from the pserver - */ - void sendParameter(const SendParameterRequest& request, - std::unique_ptr<MsgReader> msgReader, - ProtoResponseCallbackEx callback); - - void sendData(const SendDataRequest& request, - std::unique_ptr<MsgReader> msgReader, - ProtoResponseCallbackEx callback); - - /** - * @brief send config to the pserver - * - * @note it helps the pserver to understand the configuration for - * optimization, - * logging control, duplicated initialization, etc. - */ - void setConfig(const SetConfigRequest& request, - ProtoResponseCallback callback); - - /** - * @brief get the pserver status - * - * @note used to check whether parameters are ready at the pserver - */ - void getStatus(const GetStatusRequest& request, - ProtoResponseCallback callback); - - /** - * @brief set the pserver status - * - * @note used to mark whether parameters are ready at the pserver, since - * parameters at the pserver are initialized by the trainer - */ - void setStatus(const SetStatusRequest& request, - ProtoResponseCallback callback); - - /** - * @brief framework for doing some operation at the pserver end - * - * @note if sync-sgd is used, the controller will call the op_SGD action - * for gradient optimization. - * check the available operations in opFuncs[] - */ - void doOperation(const DoOperationRequest& request, - ProtoResponseCallback callback); - - /// Create a column vector. The size is the dimension of the parameter. - void createVector(const CreateVectorRequest& request, - ProtoResponseCallback callback); - - void releaseVector(const ReleaseVectorRequest& request, - ProtoResponseCallback callback); - - /// Create a column-major matrix. The number of rows is the dimension of - /// the parameter. The number of columns is specified by num_cols. - void createMatrix(const CreateMatrixRequest& request, - ProtoResponseCallback callback); - - void releaseMatrix(const ReleaseMatrixRequest& request, - ProtoResponseCallback callback); - /** - * @brief stateful control for indicating sync pass start - * - * @note it is valuable for logging and state control, - * especially for sync-sgd control - */ - void waitPassStart(const WaitPassStartRequest& request, - ProtoResponseCallback callback); - - /** - * @brief stateful control for indicating sync pass end - * - * @note it is valuable for logging and state control, - * especially for sync-sgd control - */ - void waitPassFinish(const WaitPassFinishRequest& request, - ProtoResponseCallback callback); - - /** - * @brief synchronize all distributed trainers - * - * @note it's a general api for synchronizing trainer and pserver - */ - void synchronize(const SynchronizeRequest& request, - ProtoResponseCallback callback); - - /** - * @brief stateful control for indicating that an async pass is finished - * - * @note it is valuable for logging control, state reset, etc. - */ - void asyncFinishPass(const SynchronizeRequest& request, - ProtoResponseCallback callback); - - void loadValueVector(const LoadValueRequest& request, - ProtoResponseCallback callback); - - void saveValueVector(const SaveValueRequest& request, - ProtoResponseCallback callback); - - public: - /** - * @brief initialize the parameter server - */ - bool init(); - - /** - * @brief set parameters at the pserver - * - * @note does parameter initialization if necessary - */ - void setParameter(const SendParameterRequest& request, - std::vector<Buffer>& inputBuffers, - SendParameterResponse* response, - std::vector<Buffer>* outputBuffers); - - /** - * @brief receive gradients and do optimization for async-sgd - * - * @note this api asynchronously receives all data from all - * trainers, immediately does the optimization, and returns the - * optimized values to the trainer. - * the routine above does block-based atomic updating, - * which means different blocks can be based on differently - * stale gradients. - * by default it discards some lagged gradients for - * better convergence. - */ - void asyncSGD(const SendParameterRequest& request, - std::vector<Buffer>& inputBuffers, - SendParameterResponse* response, - std::vector<Buffer>* outputBuffers); - - /** - * @brief merge gradients from all trainers - * - * @note this api uses block-based parallelization as fine-grained - * parallelization, which reduces lock contention and hides - * communication latency, and can also harness multiple cores - * efficiently. - * it also implements the synchronization for sync-sgd - */ - void addGradient(const SendParameterRequest& request, - std::vector<Buffer>& inputBuffers, - SendParameterResponse* response, - std::vector<Buffer>* outputBuffers); - - /** - * @brief get dense parameters from the pserver - * - * @note under some specified conditions, the trainer will get parameters - * from the pservers. - * e.g. - * if all parameters are stored at the pserver end for big model - * training, the trainer can use it to retrieve all parameters if - * necessary. - */ - void getParameter(const SendParameterRequest& request, - std::vector<Buffer>& inputBuffers, - SendParameterResponse* response, - std::vector<Buffer>* outputBuffers); - - /** - * @brief get sparse values from the parameter server - * - * @note with sparse enabled, the pservers own all the latest values - * while the trainer only retrieves the values that are needed. - * e.g. - * the trainer does a prefetch action to retrieve the necessary - * latest values from the pserver for sparse calculation.
- */ - void getParameterSparse(const SendParameterRequest& request, - std::vector<Buffer>& inputBuffers, - SendParameterResponse* response, - std::vector<Buffer>* outputBuffers); - - protected: - void mergeSegments(BlockSegments* segments); - - /// set the unused segments to zero - void clearUnusedSegments(CpuVector* vec); - - // TODO(yanfei): - // if reading data and doing optimization were interleaved block by block, - // the performance could be better thanks to less network congestion. - /// read all data from the connection and store it in a static - /// pre-allocated buffer - void readAllBlocks(MsgReader* msgReader, - std::vector<Buffer>* buffers); - - const ParameterConfig& getParameterConfig(const ParameterBlock& block) { - CHECK_LT(block.para_id(), -1UL) << "invalid parameter id:" - << block.para_id(); - const auto it = configMap_.find(block.para_id()); - CHECK(it != configMap_.end()) << "can not find parameter id: " - << block.para_id(); - return it->second; - } - - /// it implicitly checks blockOffsetMap_ while retrieving the blockId - const ParameterConfig& getParameterConfig(int64_t blockId) const { - CHECK(blockId >= 0 && blockId < (int64_t)blockInfos_.size()) - << "block idx out of range, id: " << blockId - << " info size: " << blockInfos_.size(); - return *(blockInfos_[blockId].config); - } - - template <typename Response> - bool isValidVectorHandle(int64_t handle, Response* response) { - if (handle < 0 || (size_t)handle >= vectors_.size()) { - LOG(ERROR) << "Invalid vector handle " << handle; - response->set_return_message(kRetMsgInvalidVectorHandle); - return false; - } - return true; - } - - template <typename Response> - bool isValidMatrixHandle(int64_t handle, Response* response) { - if (handle < 0 || (size_t)handle >= matrices_.size()) { - LOG(ERROR) << "Invalid matrix handle " << handle; - response->set_return_message(kRetMsgInvalidMatrixHandle); - return false; - } - return true; - } - - /** - * @brief get the block offset - * - * @note block.begin_dim is added to the block offset. - * returns -1 if the block cannot be found - */ - int64_t getBlockOffset(const ParameterBlock& block) const { - BlockKey key(block.para_id(), block.block_id()); - auto it = blockOffsetMap_.find(key); - if (it == blockOffsetMap_.end()) { - return -1; - } - return it->second; - } - - /// returns -1 if the block cannot be found - int64_t getBlockId(const ParameterBlock& block) const { - BlockKey key(block.para_id(), block.block_id()); - auto it = blockIdMap_.find(key); - if (it == blockIdMap_.end()) { - return -1; - } - return it->second; - } - - /** - * @brief prepare data for sending back - * - * @note modifies the response and outputBuffers for sending parameters - * back to the client. The buffer for socket sending uses - * vectors_[parameterType] directly, - * for dense with sync-sgd - */ - void sendBackParameter(const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - std::vector<Buffer>* outputBuffers);
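The second sendBackParameter overload below copies values into a per-request buffer before queueing the socket write, because under async SGD the optimizer keeps mutating vectors_ while a previous response may still be in flight. A condensed, self-contained illustration of the two send-back strategies (types and names here are illustrative, not Paddle's API):

```cpp
#include <cstring>
#include <vector>

using real = float;

// Sync SGD: everyone is parked at a barrier, so the response can point
// straight into the parameter vector (zero copy).
const real* sendBackSync(const std::vector<real>& params) {
  return params.data();  // safe: no concurrent writer while sending
}

// Async SGD: snapshot into a caller-owned buffer, because the optimizer
// may overwrite `params` before the socket write completes.
void sendBackAsync(const std::vector<real>& params, real* out, size_t n) {
  std::memcpy(out, params.data(), n * sizeof(real));
}

int main() {
  std::vector<real> params = {1.f, 2.f, 3.f};

  const real* view = sendBackSync(params);  // zero-copy view
  std::vector<real> snapshot(params.size());
  sendBackAsync(params, snapshot.data(), params.size());

  params[0] = 99.f;  // a later async optimizer update
  // The zero-copy view observes the mutation; the snapshot does not.
  return (view[0] == 99.f && snapshot[0] == 1.f) ? 0 : 1;
}
```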
- - /** - * @brief prepare data for sending back - * - * @note modifies the response and outputBuffers for sending parameters - * back to the client. The buffer for socket sending uses buffer->base. - * The parameter values are copied from vectors_[parameterType] - * to buffer->base, - * for dense with async-sgd - */ - void sendBackParameter(const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - Buffer* buffer, - std::vector<Buffer>* outputBuffers); - /** - * @brief prepare data for sending back - * - * @note specialized for sparse - */ - void sendBackParameterSparse(const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - Buffer* buffer, - size_t width, - std::vector<Buffer>* outputBuffers); - - /** - * framework routine for block parallelization - * e.g. - * for optimization on all blocks at the pserver end, this routine - * facilitates running the per-block optimization in parallel with - * multiple threads. - */ - typedef std::function<void(int64_t blockId, const VectorPtr vecs[])> ExecFunc; - void parallelExecForEachBlock(ExecFunc func); - void blockTraverse(BlockInfo& info, - const ParameterConfig& config, - int64_t offset, - size_t size, - const VectorPtr vecs[], - const ParameterOptimizer::TraverseCallback& callback); - - public: - typedef void (ParameterServer2::*OperatorFunction)(const Operation& operation, - OperationResult* result); - - /** - * doOperation will call the following operations indirectly - * e.g. - * for sync-sgd control, the controller in the remote updater sends the - * op_SGD command to the pserver, then immediately sends a sendParameter - * request to the pserver. the two functions at the pserver end cooperate - * to achieve the sync-sgd gradient merge and optimization. - * most of the following operations are specific to owlqn; all operations - * are under the context of the doOperation function - */ - static OperatorFunction opFuncs[]; - - void op_SGD(const Operation& operation, OperationResult* result); - - void op_RESET(const Operation& operation, OperationResult* result); - - void op_utv(const Operation& operation, OperationResult* result); - - void op_au_bv(const Operation& operation, OperationResult* result); - - void op_COPY(const Operation& operation, OperationResult* result); - - void op_au(const Operation& operation, OperationResult* result); - - void op_au_bv_cw(const Operation& operation, OperationResult* result); - - void op_make_steepest_desc_dir(const Operation& operation, - OperationResult* result); - - void op_fix_dir_signs(const Operation& operation, OperationResult* result); - - void op_dir_deriv(const Operation& operation, OperationResult* result); - - void op_fix_omega_signs(const Operation& operation, OperationResult* result); - - void op_cost(const Operation& operation, OperationResult* result); - - void op_start_pass(const Operation& operation, OperationResult* result); - void op_finish_pass(const Operation& operation, OperationResult* result); - - void op_apply(const Operation& operation, OperationResult* result); - - void op_randomize(const Operation& operation, OperationResult* result); - - void op_load(const Operation& operation, OperationResult* result); - void op_save(const Operation& operation, OperationResult* result); -}; - -} // namespace paddle diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp deleted file mode 100644 index 206cd17c379f529579c103893cfb492524bc6f8d..0000000000000000000000000000000000000000 --- a/paddle/pserver/test/SocketTest.cpp +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/Util.h" - -#include -#include -#include -#include -#include - -#include - -#include "paddle/math/Vector.h" -#include "paddle/utils/Logging.h" - -struct MessageHeader { - int64_t dataLength; -}; - -class Thread { - public: - void start(); - virtual void run() = 0; - virtual ~Thread() {} - - protected: - std::unique_ptr thread_; -}; - -void Thread::start() { - thread_.reset(new std::thread([this]() { this->run(); })); -} - -class SocketChannel { - public: - explicit SocketChannel(int socket) : socket_(socket) {} - int getSocketFd() const { return socket_; } - uint64_t readAll(void* buf, size_t size); - uint64_t writeAll(const void* buf, size_t size); - - protected: - int socket_; -}; - -uint64_t SocketChannel::readAll(void* buf, size_t size) { - uint64_t total = 0; - while (total < size) { - int64_t len = read(socket_, (char*)buf + total, size - total); - if (len <= 0) { - return total; - } - total += len; - } - return total; -} - -uint64_t SocketChannel::writeAll(const void* buf, size_t size) { - uint64_t total = 0; - while (total < size) { - int64_t len = write(socket_, (const char*)buf + total, size - total); - if (len <= 0) { - return total; - } - total += len; - } - return total; -} - -class SocketWorker : public Thread { - public: - explicit SocketWorker(int socket) : channel_(socket) {} - virtual void run(); - - // read n bytes. 
- int64_t readAll(char* buf, size_t n); - - // write n bytes - - protected: - SocketChannel channel_; - std::string buffer_; -}; - -class SocketServer : public Thread { - public: - explicit SocketServer(int port) - : port_(port), socket_(0), maxPendingConnections_(100) {} - - virtual void run(); - - protected: - int port_; - int socket_; - int maxPendingConnections_; -}; - -void SocketServer::run() { - int newsockfd; - socklen_t clilen; - struct sockaddr_in serv_addr, cli_addr; - - /* First call to socket() function */ - socket_ = socket(AF_INET, SOCK_STREAM, 0); - CHECK(socket_ >= 0) << "ERROR opening socket"; - - /* Initialize socket structure */ - bzero((char*)&serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = INADDR_ANY; - serv_addr.sin_port = htons(port_); - - /* Now bind the host address using bind() call.*/ - CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) - << "ERROR on binding"; - - /* Now start listening for the clients, here process will - * go in sleep mode and will wait for the incoming connection - */ - listen(socket_, maxPendingConnections_); - clilen = sizeof(cli_addr); - - while (true) { - /* Accept actual connection from the client */ - newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen); - CHECK(newsockfd >= 0) << "ERROR on accept"; - - SocketWorker* worker = new SocketWorker(newsockfd); - worker->start(); - } -} - -void SocketWorker::run() { - MessageHeader header; - - while (true) { - int64_t n = channel_.readAll(&header, sizeof(header)); - CHECK(n == sizeof(header)) << "ERROR reading from socket"; - - buffer_.resize(header.dataLength); - n = channel_.readAll(&buffer_[0], header.dataLength); - CHECK(n == header.dataLength) << "ERROR reading from socket"; - - /* Write a response to the client */ - n = channel_.writeAll(&header, sizeof(header)); - CHECK(n == sizeof(header)) << "ERROR reading from socket"; - n = channel_.writeAll(buffer_.data(), buffer_.size()); - CHECK(n == header.dataLength) << "ERROR writing to socket"; - } -} - -class SocketClient { - public: - SocketClient(const std::string& serverAddr, int serverPort); - SocketChannel* getChannel() const { return channel_.get(); } - - protected: - std::unique_ptr channel_; -}; - -SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { - struct sockaddr_in serv_addr; - struct hostent* server; - - // char buffer[256]; - - /* Create a socket point */ - int sockfd = socket(AF_INET, SOCK_STREAM, 0); - CHECK(sockfd >= 0) << "ERROR opening socket"; - server = gethostbyname(serverAddr.c_str()); - CHECK(server) << "ERROR, no such host: " << serverAddr; - - bzero((char*)&serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - bcopy((char*)server->h_addr, - (char*)&serv_addr.sin_addr.s_addr, - server->h_length); - serv_addr.sin_port = htons(serverPort); - - /* Now connect to the server */ - CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) - << "ERROR connecting"; - - channel_.reset(new SocketChannel(sockfd)); -} - -DEFINE_string(server_addr, "127.0.0.1", "Server address"); -DEFINE_int64(dim, 10000000, "Data size"); -DEFINE_int32(loop_time, 100000, "test loop time"); - -using namespace paddle; // NOLINT - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - SocketServer server(FLAGS_port); - server.start(); - sleep(1); - - SocketClient client(FLAGS_server_addr, FLAGS_port); - - SocketChannel* channel = client.getChannel(); - - MessageHeader header; - - uint64_t dataSize = 
FLAGS_dim * sizeof(real); - -#ifdef PADDLE_WITH_CUDA - GpuVector gpuParam(FLAGS_dim); - GpuVector gpuGrad(FLAGS_dim); -#else - CpuVector gpuParam(FLAGS_dim); - CpuVector gpuGrad(FLAGS_dim); -#endif - CpuVector cpuParam(FLAGS_dim); - CpuVector cpuGrad(FLAGS_dim); - - gpuParam.rand(); - gpuGrad.rand(); - cpuParam.rand(); - cpuGrad.rand(); - - for (int i = 0; i < FLAGS_loop_time; ++i) { - cpuGrad.copyFrom(gpuGrad); - - header.dataLength = dataSize; - CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header)) - << "Client write header error"; - - CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize) - << "Client write data error"; - - /* Now read server response */ - CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header)) - << "Client read header error"; - - CHECK_EQ((uint64_t)header.dataLength, dataSize); - CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize) - << "Client read data error"; - - gpuParam.copyFrom(cpuParam); - - LOG_EVERY_N(INFO, 100) << "i=" << i; - } - exit(0); -} diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp deleted file mode 100644 index 01d179258dffaf996a57022801ee3bd60a268f77..0000000000000000000000000000000000000000 --- a/paddle/pserver/test/test_ParameterServer2.cpp +++ /dev/null @@ -1,624 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_int32(num_gradient_servers); -DEFINE_string(server_addr, "127.0.0.1", "assign server address"); -DEFINE_int32(server_cpu, 0, "assign server cpu"); - -class ParameterServer2Tester : public ParameterServer2 { - public: - ParameterServer2Tester(std::string serverAddr, - int port, - int rdmaCpu = -1, - bool sepSendAndRecv = false) - : ParameterServer2(serverAddr, port, rdmaCpu), client_(sepSendAndRecv) {} - virtual ~ParameterServer2Tester() {} - void setup() { - CHECK(ParameterServer2::init()); - - parameters_.clear(); - clientConfigs_.clear(); - - clientConfigs_.resize(2); - { - ParameterConfig& config = clientConfigs_[0]; - config.set_name("para0"); - config.set_para_id(0); - config.set_size(10000); - config.set_device(-1); - config.set_learning_rate(1.0); - config.set_momentum(0.9); - } - - { - ParameterConfig& config = clientConfigs_[1]; - config.set_name("para1"); - config.set_para_id(1); - config.set_size(5000); - config.set_device(-1); - config.set_learning_rate(0.5); - config.set_momentum(0.4); - } - - for (auto& config : clientConfigs_) { - parameters_.emplace_back(new Parameter(config, /* useGpu= */ false)); - } - - size_t id = 0; - for (auto& para : parameters_) { - para->setID(id++); - } - - CHECK(client_.init(parameters_)); - OptimizationConfig optConfig; - optConfig.set_algorithm("async_sgd"); - optConfig.set_batch_size(100); - optConfig.set_learning_rate(0.1); - client_.setConfig(optConfig); - client_.setParameter(); - } - - void setConfigTest(); - void setStatusTest(); - void sendParameterTest(); - void sendDataTest(SendDataType type, size_t size); - void operationTest(); - void mergeBlockSegmentTest(); - void checkSegments(const BlockSegments& expected, const BlockSegments& segs); - void waitPassFinishTest(); - void synchronizeTest(); - - protected: - ParameterClient2 client_; - vector clientConfigs_; - vector parameters_; -}; - -std::unique_ptr g_server; - -void ParameterServer2Tester::setConfigTest() { - setup(); - - for (auto& config : clientConfigs_) { - auto it = configMap_.find(config.para_id()); - EXPECT_TRUE(it != configMap_.end()); - auto& serverConfig = it->second; - EXPECT_EQ(config.name(), serverConfig.name()); - EXPECT_EQ(config.size(), serverConfig.size()); - EXPECT_EQ(config.learning_rate(), serverConfig.learning_rate()); - EXPECT_EQ(config.momentum(), serverConfig.momentum()); - } -} - -void ParameterServer2Tester::setStatusTest() { - setup(); - EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_NOT_SET)); - client_.setStatus(PSERVER_STATUS_PARAMETER_READY); - EXPECT_EQ(PSERVER_STATUS_PARAMETER_READY, status_); - EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_PARAMETER_READY)); -} - -real sumVector(const CpuVector& vec) { - const real* data = vec.getData(); - size_t dim = vec.getSize(); - real sum = 0; - for (size_t i = 0; i < dim; ++i) { - sum += data[i]; - } - return sum; -} - -void ParameterServer2Tester::sendParameterTest() { - setup(); - - client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - false); // sendBackParameter = false - - vector parameterCopies; - - for (auto& parameter : parameters_) { - parameterCopies.emplace_back( - new Parameter(parameter->getConfig(), /* useGpu= */ false)); - parameterCopies.back() - ->getBuf(PARAMETER_VALUE) - ->copyFrom(*parameter->getBuf(PARAMETER_VALUE)); - } - - client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM, - 
PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true); // sendBackParameter = true - - for (size_t i = 0; i != parameters_.size(); ++i) { - real* v1 = parameters_[i]->getBuf(PARAMETER_VALUE)->getData(); - real* v2 = parameterCopies[i]->getBuf(PARAMETER_VALUE)->getData(); - EXPECT_EQ(parameters_[i]->getSize(), parameterCopies[i]->getSize()); - size_t size = parameters_[i]->getSize(); - real sum1 = 0, sum2 = 0; - for (size_t j = 0; j < size; ++j) { - sum1 += v1[j]; - sum2 += v2[j]; - } - EXPECT_EQ(sum1, sum2); - } -} - -void ParameterServer2Tester::sendDataTest(SendDataType type, size_t size) { - ParameterClient2 client1(true); - client1.init(parameters_); - ParameterClient2 client2(true); - client2.init(parameters_); - ParameterClient2 client3(true); - client3.init(parameters_); - - ThreadWorker worker1; - ThreadWorker worker2; - ThreadWorker worker3; - - double* testData1 = new double[size]; - double* testData2 = new double[size]; - double* testData3 = new double[size]; - double* getDataExpect = new double[size]; - double* getDataReal = new double[size]; - for (size_t i = 0; i < size; ++i) { - testData1[i] = rand(); // NOLINT TODO(yuyang18): Use rand_r instead. - testData2[i] = rand(); // NOLINT - testData3[i] = rand(); // NOLINT - getDataExpect[i] = testData1[i] + testData2[i] + testData3[i]; - } - - auto put1 = [&]() { - LOG(INFO) << "putOwnData1 start"; - client1.putOwnData(0, type, testData1, size); - LOG(INFO) << "putOwnData1 finish"; - }; - - auto get1 = [&]() { - LOG(INFO) << "sendData1 get all start"; - client1.getAllData(0, type, getDataReal, size); - for (size_t i = 0; i < size; ++i) { - CHECK_EQ(getDataReal[i], getDataExpect[i]); - } - LOG(INFO) << "sendData1 get all finish"; - }; - - auto put2 = [&]() { - LOG(INFO) << "putOwnData2 start"; - client2.putOwnData(1, type, testData2, size); - LOG(INFO) << "putOwnData2 finish"; - }; - - auto put3 = [&]() { - LOG(INFO) << "putOwnData3 start"; - client3.putOwnData(2, type, testData3, size); - LOG(INFO) << "putOwnData3 finish"; - }; - - worker1.addJob(put1); - worker1.addJob(get1); - worker2.addJob(put2); - worker3.addJob(put3); - - worker1.addJob(put1); - worker2.addJob(put2); - worker3.addJob(put3); - worker1.addJob(get1); - - worker1.wait(); - worker2.wait(); - worker3.wait(); - free(testData1); - free(testData2); - free(testData3); - free(getDataExpect); - free(getDataReal); -} - -void ParameterServer2Tester::operationTest() { - PServerVector v1, v2; - v1 = client_.createVector(); - EXPECT_EQ(NUM_PARAMETER_TYPES, v1.handle); - - v2 = client_.createVector(); - EXPECT_EQ(NUM_PARAMETER_TYPES + 1, v2.handle); - - PreparedOperations ops; - ops.addOperation(PSERVER_OP_RESET, v1, (real)1); - ops.addOperation(PSERVER_OP_RESET, v2, (real)2); - - real res1, res2, res3; - ops.addOperation(PSERVER_OP_utv, v1, v2)(&res1); - - ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1); - ops.addOperation(PSERVER_OP_utv, v1, v2)(&res2); - - ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1); - ops.addOperation(PSERVER_OP_utv, v1, v2)(&res3); - client_.doOperation(ops, false, false); - - EXPECT_EQ(30000, res1); - EXPECT_EQ(15000, res2); - EXPECT_EQ(0, res3); - - PServerMatrix m1, m2; - m1 = client_.createMatrix(4); - EXPECT_EQ(0, m1.handle); - m2 = client_.createMatrix(8); - EXPECT_EQ(1, m2.handle); - - // TODO(yuyang18): add tests for other operations OP_COPY, OP_au - - client_.releaseVector(v1); - client_.releaseVector(v2); - client_.releaseMatrix(m1); - client_.releaseMatrix(m2); -} - -void 
ParameterServer2Tester::checkSegments(const BlockSegments& expected, - const BlockSegments& segs) { - EXPECT_EQ(expected.size(), segs.size()); - if (expected.size() != segs.size()) { - return; - } - for (size_t i = 0; i < expected.size(); ++i) { - EXPECT_EQ(expected[i], segs[i]); - } -} - -void ParameterServer2Tester::mergeBlockSegmentTest() { - { - BlockSegments segs{{10, 20}, {30, 45}, {50, 70}}; - mergeSegments(&segs); - checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {50, 70}, {10, 20}}; - mergeSegments(&segs); - checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {50, 70}, {10, 30}}; - mergeSegments(&segs); - checkSegments({{10, 45}, {50, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {10, 70}, {10, 30}}; - mergeSegments(&segs); - checkSegments({{10, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {50, 70}, {10, 35}}; - mergeSegments(&segs); - checkSegments({{10, 45}, {50, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {50, 70}, {10, 60}}; - mergeSegments(&segs); - checkSegments({{10, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {50, 70}, {30, 47}}; - mergeSegments(&segs); - checkSegments({{30, 47}, {50, 70}}, segs); - } -} - -void ParameterServer2Tester::waitPassFinishTest() { - ParameterClient2 client1; - ParameterClient2 client2; - ParameterClient2 client3; - - ThreadWorker worker1; - ThreadWorker worker2; - ThreadWorker worker3; - - auto init1 = [&]() { - LOG(INFO) << "init1 start"; - client1.init(parameters_); - LOG(INFO) << "init1 finish"; - }; - - auto init2 = [&]() { - LOG(INFO) << "init2 start"; - client2.init(parameters_); - LOG(INFO) << "init2 finish"; - }; - - auto init3 = [&]() { - LOG(INFO) << "init3 start"; - client3.init(parameters_); - LOG(INFO) << "init3 finish"; - }; - - auto update1 = [&]() { - LOG(INFO) << "update1 start"; - client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true); // sendBackParameter = false - LOG(INFO) << "update1 finish"; - }; - - auto wait1 = [&]() { - LOG(INFO) << "wait1 start"; - client1.waitPassFinish(); - LOG(INFO) << "wait1 finish"; - }; - - auto update2 = [&]() { - LOG(INFO) << "update2 start"; - client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true); // sendBackParameter = false - LOG(INFO) << "update2 finish"; - }; - - auto wait2 = [&]() { - LOG(INFO) << "wait2 start"; - client2.waitPassFinish(); - LOG(INFO) << "wait2 finish"; - }; - - auto op3 = [&]() { - LOG(INFO) << "op3 start"; - PreparedOperations ops; - ops.addOperation(PSERVER_OP_SGD); - client3.doOperation(ops, - /* waitForGradient= */ true, - /* sendBackarameter= */ true); - LOG(INFO) << "op3 finish"; - }; - - worker1.addJob(init1); - worker2.addJob(init2); - worker3.addJob(init3); - - worker1.addJob(update1); - worker2.addJob(update2); - worker3.addJob(op3); - - worker3.addJob(op3); - worker3.addJob(op3); - worker2.addJob(update2); - worker2.addJob(update2); - worker1.addJob(wait1); - - worker2.addJob(wait2); - worker3.addJob(op3); - - worker1.wait(); - worker2.wait(); - worker3.wait(); - - LOG(INFO) << "Pass 1 finished"; - - worker1.addJob(update1); - worker2.addJob(update2); - worker3.addJob(op3); - - worker1.wait(); - worker2.wait(); - worker3.wait(); - - worker3.addJob(op3); - worker3.addJob(op3); - worker1.addJob(update1); - worker1.addJob(wait1); - worker2.addJob(wait2); - - worker1.wait(); - worker2.wait(); - 
worker3.wait(); - - LOG(INFO) << "Pass 2 finished"; -} - -void ParameterServer2Tester::synchronizeTest() { - ParameterClient2 client1; - ParameterClient2 client2; - - ThreadWorker worker1; - ThreadWorker worker2; - - FLAGS_log_period_server = 2; - - auto init1 = [&]() { - LOG(INFO) << "init1 start"; - client1.init(parameters_); - client1.setTrainerId(0); - LOG(INFO) << "init1 finish"; - }; - - auto init2 = [&]() { - LOG(INFO) << "init2 start"; - client2.init(parameters_); - client2.setTrainerId(1); - LOG(INFO) << "init2 finish"; - }; - - auto update1 = [&]() { - LOG(INFO) << "update1 start"; - client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true); // sendBackParameter = false - LOG(INFO) << "update1 finish"; - }; - - auto wait1 = [&]() { - LOG(INFO) << "wait1 start"; - client1.asyncFinishPass(); - LOG(INFO) << "wait1 finish"; - }; - - auto update2 = [&]() { - LOG(INFO) << "update2 start"; - client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true); // sendBackParameter = false - LOG(INFO) << "update2 finish"; - }; - - auto wait2 = [&]() { - LOG(INFO) << "wait2 start"; - client2.asyncFinishPass(); - LOG(INFO) << "wait2 finish"; - }; - - worker1.addJob(init1); - worker2.addJob(init2); - // call wait to reset some stats at pserver - worker1.addJob(wait1); - worker2.addJob(wait2); - - worker1.addJob(update1); - worker2.addJob(update2); - - worker2.addJob(update2); - worker2.addJob(update2); - worker1.addJob(wait1); - - worker2.addJob(wait2); - - worker1.wait(); - worker2.wait(); - LOG(INFO) << "Pass 1 finished"; - - worker1.addJob(update1); - worker2.addJob(update2); - - worker1.wait(); - worker2.wait(); - - worker1.addJob(update1); - worker2.addJob(update2); - worker1.addJob(update1); - worker1.addJob(update1); - worker1.addJob(update1); - worker1.addJob(update1); - worker1.addJob(update1); - worker1.addJob(update1); - worker1.addJob(wait1); - worker2.addJob(wait2); - - worker1.wait(); - worker2.wait(); - LOG(INFO) << "Pass 2 finished"; -} - -TEST(ParameterServer2, sendParameter) { g_server->sendParameterTest(); } - -TEST(ParameterServer2, setConfig) { g_server->setConfigTest(); } - -TEST(ParameterServer2, setStatus) { g_server->setStatusTest(); } - -TEST(ParameterServer2, operation) { g_server->operationTest(); } - -TEST(ParameterServer2, mergeBlockSegment) { g_server->mergeBlockSegmentTest(); } - -TEST(ParameterServer2, waitPassFinish) { g_server->waitPassFinishTest(); } - -TEST(ParameterServer2, synchronize) { g_server->synchronizeTest(); } - -TEST(ParameterServer2, sendData) { - // Set gserver and pserver all 3, so that the test is sufficient. 
- int oldFlagsPortsNUm = FLAGS_ports_num; - int oldFlagsNumGradientServers = FLAGS_num_gradient_servers; - int oldFlagsPort = FLAGS_port; - FLAGS_ports_num = 3; - FLAGS_num_gradient_servers = 3; - FLAGS_port = FLAGS_port + 1; - std::unique_ptr g_server1; - std::unique_ptr g_server2; - std::unique_ptr g_server3; - if (FLAGS_rdma_tcp == "rdma") { - g_server1.reset(new ParameterServer2Tester( - FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu)); - g_server1->start(); - g_server2.reset(new ParameterServer2Tester( - FLAGS_server_addr, FLAGS_port + 1, FLAGS_server_cpu + 1)); - g_server2->start(); - g_server3.reset(new ParameterServer2Tester( - FLAGS_server_addr, FLAGS_port + 2, FLAGS_server_cpu + 2)); - g_server3->start(); - } else { // tcp - g_server1.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port)); - g_server1->start(); - g_server2.reset( - new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 1)); - g_server2->start(); - g_server3.reset( - new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 2)); - g_server3->start(); - } - - g_server2->init(); - g_server3->init(); - sleep(2); - g_server1->setup(); - g_server1->sendDataTest(DATA_REDUCE_SUM, 1 << 24); - sleep(2); - g_server1->sendDataTest(DATA_REDUCE_SUM, 2); - sleep(2); - g_server1.reset(); - g_server2.reset(); - g_server3.reset(); - - FLAGS_ports_num = oldFlagsPortsNUm; - FLAGS_num_gradient_servers = oldFlagsNumGradientServers; - FLAGS_port = oldFlagsPort; -} - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - - FLAGS_num_gradient_servers = 2; - - if (FLAGS_rdma_tcp == "rdma") { - g_server.reset(new ParameterServer2Tester( - FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu)); - } else { - g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port)); - } - - g_server->start(); - - sleep(2); - - int ret = RUN_ALL_TESTS(); - - g_server.reset(); - - exit(ret); -} diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp deleted file mode 100644 index a66b14a1cc58d11988e4936a9c35d98b8bf5edc1..0000000000000000000000000000000000000000 --- a/paddle/pserver/test/test_ProtoServer.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "ParameterService.pb.h" -#include "paddle/math/Vector.h" -#include "paddle/pserver/ProtoServer.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -DEFINE_string(server_addr, "127.0.0.1", "Server address"); -DEFINE_int64(dim, 50000000, "Data size"); -DEFINE_bool(test_proto_server, true, "whether to test ProtoServer"); -DEFINE_bool(benchmark, false, "Do benchmark. 
Skip some tests"); - -using namespace paddle; // NOLINT - -class MyServer : public ProtoServer { - public: - explicit MyServer(int port, int rdmaCpu = -1) - : ProtoServer(FLAGS_server_addr, port, rdmaCpu), - status_(PSERVER_STATUS_NOT_SET) { - REGISTER_SERVICE_FUNCTION(MyServer, getStatus); - REGISTER_SERVICE_FUNCTION(MyServer, setStatus); - REGISTER_SERVICE_FUNCTION_EX(MyServer, getStatusEx); - } - void getStatus(const GetStatusRequest& request, - ProtoResponseCallback callback) { - (void)request; - GetStatusResponse response; - response.set_status(status_); - callback(response); - } - - void getStatusEx(const GetStatusRequest& request, - std::unique_ptr msgReader, - ProtoResponseCallbackEx callback) { - (void)request; - GetStatusResponse response; - response.set_status(status_); - buffer_.resize(msgReader->getNextBlockLength()); - msgReader->readNextBlock(&buffer_[0]); - callback(response, {{&buffer_[0], buffer_.size()}}); - } - - void setStatus(const SetStatusRequest& request, - ProtoResponseCallback callback) { - SetStatusResponse response; - status_ = request.status(); - callback(response); - } - - protected: - PServerStatus status_; - std::string buffer_; -}; - -TEST(ProtoServer, regular) { - ProtoClient* client; - if (FLAGS_rdma_tcp == "rdma") - client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA); - else - client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP); - { - GetStatusRequest request; - GetStatusResponse response; - auto msgReader = client->sendAndRecv("getStatus", request, &response); - EXPECT_EQ(response.status(), PSERVER_STATUS_NOT_SET); - EXPECT_EQ(msgReader->getNumBlocks(), (size_t)0); - } - - { - SetStatusRequest request; - SetStatusResponse response; - request.set_status(PSERVER_STATUS_PARAMETER_READY); - client->sendAndRecv("setStatus", request, &response); - } - - { - GetStatusRequest request; - GetStatusResponse response; - client->sendAndRecv("getStatus", request, &response); - EXPECT_EQ(response.status(), PSERVER_STATUS_PARAMETER_READY); - } - - delete client; -} - -TEST(ProtoServer, extended) { -#ifdef PADDLE_WITH_CUDA - ProtoClient* client; - if (FLAGS_rdma_tcp == "rdma") - client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA); - else - client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP); - int64_t dataSize = FLAGS_dim * sizeof(real); - - GpuVector gpuParam(FLAGS_dim); - GpuVector gpuGrad(FLAGS_dim); - CpuVector cpuParam(FLAGS_dim); - CpuVector cpuGrad(FLAGS_dim); - - gpuParam.rand(); - gpuGrad.rand(); - cpuParam.rand(); - cpuGrad.rand(); - - for (int k = 0; k < 4; ++k) { - for (int i = 0; i < 10; ++i) { - cpuGrad.copyFrom(gpuGrad); - if (FLAGS_test_proto_server) { - GetStatusRequest request; - GetStatusResponse response; - { - REGISTER_TIMER("sendAndRecv"); - auto msgReader = - client->sendAndRecv("getStatusEx", - request, - {{cpuGrad.getData(), (size_t)dataSize}}, - &response); - - EXPECT_EQ(msgReader->getNumBlocks(), (size_t)1); - EXPECT_EQ(msgReader->getNextBlockLength(), (size_t)dataSize); - msgReader->readNextBlock(cpuParam.getData()); - } - if (!FLAGS_benchmark) { - real* v1 = cpuGrad.getData(); - real* v2 = cpuParam.getData(); - real sum1 = 0, sum2 = 0; - for (int j = 0; j < FLAGS_dim; ++j) { - sum1 += v1[j]; - sum2 += v2[j]; - } - EXPECT_EQ(sum1, sum2); - } - } - gpuParam.copyFrom(cpuParam); - - LOG_EVERY_N(INFO, 10) << "i=" << i; - } - globalStat.printAllStatus(); - globalStat.reset(); - } - - delete client; -#endif -} - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - 
testing::InitGoogleTest(&argc, argv); - MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1); - server.start(); - usleep(10000); - - return RUN_ALL_TESTS(); -} diff --git a/paddle/pserver/test/test_ProtoServer.sh b/paddle/pserver/test/test_ProtoServer.sh deleted file mode 100755 index 970c90b494c2a256cf22f3de7b7ea7964fed58ab..0000000000000000000000000000000000000000 --- a/paddle/pserver/test/test_ProtoServer.sh +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -x -for ((port=12340;port<=12360;port++)) -do - port_used_num=`netstat -a |grep $port|wc -l` - if [ $port_used_num -eq 0 ] - then - echo $port; - pserver/test/test_ProtoServer --port=$port - if [ $? -eq 0 ] - then - exit 0 - else - echo "test_ProtoServer run wrong" - exit 1 - fi -fi -done -echo "test_ProtoServer port not found" -exit 1 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index b66a05aaebda645196721fd6ed840e5584813348..d8f0b76b7ba0fedfe411aa86f6f8a0c77a02beca 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -106,7 +106,7 @@ function cmake_gen() { -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} - -DWITH_ANAKIN=${WITH_ANAKIN:-ON} + -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} ======================================== EOF @@ -135,7 +135,7 @@ EOF -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ - -DWITH_ANAKIN=${WITH_ANAKIN:-ON} \ + -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} } diff --git a/paddle/testing/TestUtil.cpp b/paddle/testing/TestUtil.cpp index cfb8c713d96008a74287fb1248657c30f3b81164..fa8efc20f59addb4526d2cbeaf34f161307c588a 100644 --- a/paddle/testing/TestUtil.cpp +++ b/paddle/testing/TestUtil.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "TestUtil.h" #include -#include "paddle/math/SparseMatrix.h" +#include "paddle/legacy/math/SparseMatrix.h" DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length"); diff --git a/paddle/testing/TestUtil.h b/paddle/testing/TestUtil.h index ec86469aebbafbf5406a21e6825eda6c105a6b9d..98b864e3c56f1938075bd039ba13a49ec457de50 100644 --- a/paddle/testing/TestUtil.h +++ b/paddle/testing/TestUtil.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include -#include "paddle/math/Matrix.h" +#include "paddle/legacy/math/Matrix.h" namespace paddle { diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp index 56c38015fb2398f8b39fac6b5a5d4af1c2fd56aa..6624d6d27bf41605ad8486ad1297558996c81978 100644 --- a/paddle/trainer/MergeModel.cpp +++ b/paddle/trainer/MergeModel.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "ParamUtil.h" #include "Trainer.h" -#include "paddle/pserver/ParameterServer2.h" +#include "paddle/legacy/pserver/ParameterServer2.h" #include "paddle/utils/PythonUtil.h" DEFINE_string(model_dir, "", "Directory for separated model files"); diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h index 02693c675e6f5cb574e52e9681963a5904676028..33c1fa7bdf347d5ad2d42384da05018bc9525d94 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.h +++ b/paddle/trainer/NewRemoteParameterUpdater.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "OptimizerConfig.pb.h" #include "ParameterUpdater.h" #include "libpaddle_pserver_cclient.h" -#include "paddle/pserver/ParameterClient2.h" +#include "paddle/legacy/pserver/ParameterClient2.h" #include "paddle/utils/Queue.h" #include "paddle/utils/Util.h" diff --git a/paddle/trainer/ParamUtil.cpp b/paddle/trainer/ParamUtil.cpp index ffbca42e106591ddeb2cefcfafbeb408c544371b..b577e3e868359912814ac41144b324a221515247 100644 --- a/paddle/trainer/ParamUtil.cpp +++ b/paddle/trainer/ParamUtil.cpp @@ -31,8 +31,8 @@ limitations under the License. */ #include "paddle/utils/Util.h" #include "TesterConfig.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/gserver/layers/ValidationLayer.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/gserver/layers/ValidationLayer.h" namespace paddle { diff --git a/paddle/trainer/ParamUtil.h b/paddle/trainer/ParamUtil.h index 10746b4d58e3a82c081987a6aaad9e0b42272a03..c34e079b90be763f8530ba3213b78f80d726e8f0 100644 --- a/paddle/trainer/ParamUtil.h +++ b/paddle/trainer/ParamUtil.h @@ -19,8 +19,8 @@ limitations under the License. */ #include #include "hl_gpu.h" -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/gserver/dataproviders/DataProvider.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" #include #include diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h index ef7ab92eca77bab2a8481561713f8034d2b8505d..0070254d1c56ff26596a7585de7833df52107acf 100644 --- a/paddle/trainer/ParameterUpdater.h +++ b/paddle/trainer/ParameterUpdater.h @@ -17,15 +17,15 @@ limitations under the License. */ #include "paddle/utils/Thread.h" #include "paddle/utils/Util.h" -#include "paddle/parameter/AverageOptimizer.h" -#include "paddle/parameter/FirstOrderOptimizer.h" -#include "paddle/parameter/OptimizerFunctions.h" -#include "paddle/parameter/OptimizerWithRegularizer.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/ParameterUpdaterBase.h" +#include "paddle/legacy/parameter/AverageOptimizer.h" +#include "paddle/legacy/parameter/FirstOrderOptimizer.h" +#include "paddle/legacy/parameter/OptimizerFunctions.h" +#include "paddle/legacy/parameter/OptimizerWithRegularizer.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/parameter/ParameterUpdaterBase.h" #include "TrainerConfig.pb.h" -#include "paddle/gserver/layers/Layer.h" +#include "paddle/legacy/gserver/layers/Layer.h" #include #include diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h index 3a40a46354efd6b92278884c8f5b72504a3ff283..7a9b687ac2ee078aa21beb5a2c20d42db4fdb429 100644 --- a/paddle/trainer/RemoteParameterUpdater.h +++ b/paddle/trainer/RemoteParameterUpdater.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include #include "ParameterUpdater.h" -#include "paddle/pserver/ParameterClient2.h" +#include "paddle/legacy/pserver/ParameterClient2.h" #include "paddle/utils/Queue.h" #include "paddle/utils/Util.h" diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp index 16e676d60248dfe6d443c50fbf34970e63c1f412..f7daf1327b470233a6918a6807ea061f6f170f98 100644 --- a/paddle/trainer/Tester.cpp +++ b/paddle/trainer/Tester.cpp @@ -30,9 +30,9 @@ limitations under the License. */ #include "paddle/utils/Util.h" #include "TesterConfig.h" -#include "paddle/gserver/gradientmachines/GradientMachineMode.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/gserver/layers/ValidationLayer.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachineMode.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/gserver/layers/ValidationLayer.h" namespace paddle { diff --git a/paddle/trainer/Tester.h b/paddle/trainer/Tester.h index 801c77e3116369732bf4b03107adce6a71dc2184..bce9775a098da92db4edecdd47e29887abf84fde 100644 --- a/paddle/trainer/Tester.h +++ b/paddle/trainer/Tester.h @@ -19,8 +19,8 @@ limitations under the License. */ #include #include "hl_gpu.h" -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/gserver/dataproviders/DataProvider.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" #include "TrainerConfig.pb.h" diff --git a/paddle/trainer/TesterConfig.h b/paddle/trainer/TesterConfig.h index 68d4c931ff2df8e24acaa9fe6b35bfd613197c72..ef10c7dbf7346bef3006cb433ac86a6ea7786946 100644 --- a/paddle/trainer/TesterConfig.h +++ b/paddle/trainer/TesterConfig.h @@ -19,7 +19,7 @@ limitations under the License. */ #include #include "hl_gpu.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" #include "TrainerConfig.pb.h" diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp index 3c85c3aaac68fc29da90c24d1208887a17009d5f..39e63c333e272da4f3b11d917e23063088189532 100644 --- a/paddle/trainer/ThreadParameterUpdater.cpp +++ b/paddle/trainer/ThreadParameterUpdater.cpp @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/utils/Logging.h" -#include "paddle/math/SparseRowMatrix.h" -#include "paddle/parameter/ThreadLocalBuffer.h" +#include "paddle/legacy/math/SparseRowMatrix.h" +#include "paddle/legacy/parameter/ThreadLocalBuffer.h" #include "paddle/utils/Thread.h" DECLARE_int32(trainer_count); diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h index b5e6a7ce3c8457364b10c921bca3386fbb6f6cbf..bd0ce990783d4fc57b03090397f4291be3fdabda 100644 --- a/paddle/trainer/ThreadParameterUpdater.h +++ b/paddle/trainer/ThreadParameterUpdater.h @@ -14,12 +14,12 @@ limitations under the License. 
*/ #pragma once -#include "paddle/parameter/AverageOptimizer.h" -#include "paddle/parameter/FirstOrderOptimizer.h" -#include "paddle/parameter/OptimizerFunctions.h" -#include "paddle/parameter/OptimizerWithRegularizer.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/Regularizer.h" +#include "paddle/legacy/parameter/AverageOptimizer.h" +#include "paddle/legacy/parameter/FirstOrderOptimizer.h" +#include "paddle/legacy/parameter/OptimizerFunctions.h" +#include "paddle/legacy/parameter/OptimizerWithRegularizer.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/parameter/Regularizer.h" #include "paddle/utils/Util.h" #include diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index 3e4a2b5fa8a3981f6362edc1dc61ae1616e257ef..edfd72197e642a76ebc3db90a406284da6cd54bc 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -33,9 +33,9 @@ limitations under the License. */ #include "TesterConfig.h" #include "ThreadParameterUpdater.h" #include "TrainerConfigHelper.h" -#include "paddle/gserver/gradientmachines/GradientMachineMode.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/gserver/layers/ValidationLayer.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachineMode.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/gserver/layers/ValidationLayer.h" DEFINE_string(config, "", "Trainer config file"); diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h index 78127b7be5cef34f51a4b540852c139625b571dd..58acec17818a80840b4c0b61641331d8d91d7ea8 100644 --- a/paddle/trainer/Trainer.h +++ b/paddle/trainer/Trainer.h @@ -19,8 +19,8 @@ limitations under the License. */ #include #include "hl_gpu.h" -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/gserver/dataproviders/DataProvider.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" #include #include diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp index 4c5d4a0913aaf3a9932b3d67806378ece4245304..b4b1a87cd5b1e8013cec9c1d8f9ce143e03209c2 100644 --- a/paddle/trainer/TrainerInternal.cpp +++ b/paddle/trainer/TrainerInternal.cpp @@ -24,8 +24,8 @@ limitations under the License. */ #include -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/gserver/layers/ValidationLayer.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/gserver/layers/ValidationLayer.h" #include "paddle/utils/GlobalConstants.h" #include "paddle/utils/PythonUtil.h" #include "paddle/utils/Stat.h" diff --git a/paddle/trainer/TrainerInternal.h b/paddle/trainer/TrainerInternal.h index 48ee53a5e60f950bfc3cc299c754b0e72601c818..ecc87966dc884065d4736920102941e52a734d88 100644 --- a/paddle/trainer/TrainerInternal.h +++ b/paddle/trainer/TrainerInternal.h @@ -25,7 +25,7 @@ limitations under the License. 
*/ #include "TrainerConfigHelper.h" #include "TrainerInternalConfig.h" #include "hl_gpu.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" namespace paddle { diff --git a/paddle/trainer/TrainerInternalConfig.h b/paddle/trainer/TrainerInternalConfig.h index 43aae381029784278ad58c9398f64af24dffa1df..29d588e1be18f1fd86eeaadf8c19074ce649470d 100644 --- a/paddle/trainer/TrainerInternalConfig.h +++ b/paddle/trainer/TrainerInternalConfig.h @@ -19,7 +19,7 @@ limitations under the License. */ #include #include "hl_gpu.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" #include "TrainerConfig.pb.h" diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index c5c1d484e5f85c774fd4b8f1d4a8d46abfa2f547..115e5d88a24b5360f2633c4bb5b73fb531e6641d 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/pserver/ParameterServerController.h" +#include "paddle/legacy/pserver/ParameterServerController.h" #include "paddle/utils/PythonUtil.h" #include "ParamUtil.h" diff --git a/paddle/trainer/tests/config_parser_test.py b/paddle/trainer/tests/config_parser_test.py index db66ebb5b7c13fe53df14a07918aad62ba895ffa..88646e11f7610846558fb7bfb02c1dafddc68fea 100644 --- a/paddle/trainer/tests/config_parser_test.py +++ b/paddle/trainer/tests/config_parser_test.py @@ -19,4 +19,5 @@ if __name__ == '__main__': parse_config_and_serialize( 'trainer/tests/sample_trainer_config.conf', 'extension_module_name=paddle.trainer.config_parser_extension') - parse_config_and_serialize('gserver/tests/pyDataProvider/trainer.conf', '') + parse_config_and_serialize( + 'legacy/gserver/tests/pyDataProvider/trainer.conf', '') diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp index 92dc8aa9ec5ce281d1950d84260c1b9555e686a7..e3cd1c904d50af2efb7c5d6fa23a99cf8c5b5c85 100644 --- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp +++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp @@ -15,9 +15,9 @@ limitations under the License. */ #ifndef PADDLE_NO_PYTHON #include #include -#include -#include -#include +#include +#include +#include #include #include #include diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp index de12c4d649c6041f497c0eeac0904ebfc0d5bf97..1e1b2d2bf46b8664892e3b0f96e7e56989ef7e25 100644 --- a/paddle/trainer/tests/test_TrainerOnePass.cpp +++ b/paddle/trainer/tests/test_TrainerOnePass.cpp @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/trainer/TrainerInternal.h" #include -#include +#include using namespace paddle; // NOLINT using namespace std; // NOLINT diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 45af83708ea63fc1b6aa86f1e8423bb44b7388a6..3034c1a0875a71421bcba172c16ee32d809df152 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -118,7 +118,8 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb' + 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', + 'init_allocated_mem' ] if core.is_compiled_with_cuda(): read_env_flags += [ diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 145f1423e4b4a2ce35ba8ac3cca37935df90727e..b436dfe70afdb52299222f8ba3f5bdff2842d103 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -348,6 +348,12 @@ class Executor(object): ] return outs + def begin_pass(self): + self.executor.begin_pass() + + def end_pass(self): + self.executor.end_pass() + def run(self, program=None, feed=None, diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index d94564e11f982575dd9c065deb20d29396203227..5c8f4f6507c7dd9b3d005639d962ce1e55b2c704 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -18,7 +18,7 @@ import time import shutil from paddle.fluid.evaluator import Evaluator -from paddle.fluid.framework import Program, Parameter, default_main_program, Variable +from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable from . import core __all__ = [ @@ -1374,3 +1374,101 @@ def get_latest_checkpoint_serial(checkpoint_dir): if success_num > current_dir: current_dir = success_num return current_dir + + +def get_test_program(filelist, program=None, startup_program=None): + """ + Transpile current train program to a program to read test dataset + if the program is using reader ops like "open_files_op". + """ + + def _copy_reader_var_(block, var, new_name=None): + if new_name == None: + new_name = var.name + new_var = block.create_var( + name=str(new_name), type=core.VarDesc.VarType.READER) + new_var.desc.set_shapes(var.desc.shapes()) + new_var.desc.set_dtypes(var.desc.dtypes()) + new_var.persistable = True + return new_var + + def _get_test_reader_name(train_reader_name): + return train_reader_name + "_test" + + def _is_reader_op(op): + block = op.block + if "Out" in op.output_names: + reader_out = block.vars[op.output("Out")[0]] + if reader_out.type == core.VarDesc.VarType.READER: + return True + return False + + if program == None: + program = default_main_program() + if startup_program == None: + startup_program = default_startup_program() + startup_block = startup_program.global_block() + + # 1. find out the orignal reader var name + startup_reader_op_list = [] + + for op in startup_block.ops: + if _is_reader_op(op): + startup_reader_op_list.append(op) + + if len(startup_reader_op_list) == 0: + return program + + root_reader_op = startup_reader_op_list[0] + train_test_reader_map = {} + # 2. 
add operators to the startup program to open and read the test data files + for op in startup_reader_op_list: + assert (len(op.output("Out")) == 1) + train_reader_name = op.output("Out")[0] + train_reader = startup_block.vars[train_reader_name] + test_reader = _copy_reader_var_( + startup_block, + train_reader, + new_name=_get_test_reader_name(train_reader_name)) + train_test_reader_map[train_reader.name] = test_reader + + test_op_inputs = {} + for name in op.input_names: + train_arg_names = op.input(name) + test_arg_vars = [] + for arg_name in train_arg_names: + arg_var = train_test_reader_map[ + arg_name] if name == "UnderlyingReader" else startup_block.vars[ + arg_name] + test_arg_vars.append(arg_var) + test_op_inputs[name] = test_arg_vars + + test_op = startup_block.append_op( + type=op.type, + inputs=test_op_inputs, + outputs={'Out': [test_reader]}, + attrs=op.attrs) + # set the root reader op's filelist attr for reading test files + if op.type == root_reader_op.type: + test_op.set_attr("file_names", filelist) + if op.type == "create_multi_pass_reader": + test_op.set_attr("pass_num", 1) + + # 3. rename reader vars in the inference program to different names + # to avoid reading from the train data. + main_block = program.global_block() + for var in main_block.vars.values(): + if var.type == core.VarDesc.VarType.READER: + main_block.rename_var( + str(var.name), str(_get_test_reader_name(var.name))) + + for op in main_block.ops: + if op.type == root_reader_op.type: + test_op.set_attr("file_names", filelist) + if op.type == "create_multi_pass_reader": + test_op.set_attr("pass_num", 1) + + startup_program.sync_with_cpp() + program.sync_with_cpp() + + return program diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 200db87f1793a41e8327b59677252c19eab567de..6af01297df54ffd4201776d20d51a88f5808ccb0 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -30,6 +30,7 @@ __all__ = [ 'detection_output', 'ssd_loss', 'detection_map', + 'anchor_generator', ] __auto__ = [ @@ -998,3 +999,95 @@ def multi_box_head(inputs, box.stop_gradient = True var.stop_gradient = True return mbox_locs_concat, mbox_confs_concat, box, var + + +def anchor_generator(input, + anchor_sizes=None, + aspect_ratios=None, + variance=[0.1, 0.1, 0.2, 0.2], + stride=None, + offset=0.5, + name=None): + """ + **Anchor generator operator** + + Generate anchors for the Faster RCNN algorithm. + Each position of the input produces N anchors, where N = + size(anchor_sizes) * size(aspect_ratios). The generated anchors loop over + aspect_ratios first and then over anchor_sizes. + + Args: + input(Variable): The input feature map, the format is NCHW. + anchor_sizes(list|tuple|float): The anchor sizes of generated anchors, + given in absolute pixels e.g. [64., 128., 256., 512.]. + For instance, an anchor size of 64 means the area of this anchor equals 64**2. + aspect_ratios(list|tuple|float): The height / width ratios of generated + anchors, e.g. [0.5, 1.0, 2.0]. + variance(list|tuple): The variances to be used in box regression deltas. + Default:[0.1, 0.1, 0.2, 0.2]. + stride(list|tuple): The anchors stride across width and height, + e.g. [16.0, 16.0]. + offset(float): Anchor center offset. Default: 0.5 + name(str): Name of the anchor generator op. Default: None. + + Returns: + Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. + H is the height of input, W is the width of input, + num_anchors is the box count of each position.
+ Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. + Variances(Variable): The expanded variances of anchors + with a layout of [H, W, num_priors, 4]. + H is the height of input, W is the width of input + num_anchors is the box count of each position. + Each variance is in (xcenter, ycenter, w, h) format. + + + Examples: + + .. code-block:: python + + anchor, var = anchor_generator( + input=conv1, + anchor_sizes=[64, 128, 256, 512], + aspect_ratios=[0.5, 1.0, 2.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + """ + helper = LayerHelper("anchor_generator", **locals()) + dtype = helper.input_dtype() + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(anchor_sizes): + anchor_sizes = [anchor_sizes] + if not _is_list_or_tuple_(aspect_ratios): + aspect_ratios = [aspect_ratios] + if not (_is_list_or_tuple_(stride) and len(stride) == 2): + raise ValueError('stride should be a list or tuple ', + 'with length 2, (stride_width, stride_height).') + + anchor_sizes = list(map(float, anchor_sizes)) + aspect_ratios = list(map(float, aspect_ratios)) + stride = list(map(float, stride)) + + attrs = { + 'anchor_sizes': anchor_sizes, + 'aspect_ratios': aspect_ratios, + 'variances': variance, + 'stride': stride, + 'offset': offset + } + + anchor = helper.create_tmp_variable(dtype) + var = helper.create_tmp_variable(dtype) + helper.append_op( + type="anchor_generator", + inputs={"Input": input}, + outputs={"Anchors": anchor, + "Variances": var}, + attrs=attrs, ) + anchor.stop_gradient = True + var.stop_gradient = True + return anchor, var diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 61c01b3b0056648b6cc67430d5d3bfffc8412928..bcf520d5a4e3bbe1d949d08f42199dd8c5cdc947 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -95,7 +95,6 @@ __all__ = [ 'relu', 'log', 'crop', - 'fill_zeros_like', ] @@ -5185,40 +5184,3 @@ def crop(x, shape=None, offsets=None, name=None): outputs={'Out': out}, attrs=None if len(attrs) == 0 else attrs) return out - - -def fill_zeros_like(x): - """ - This layer takes an input and outputs a variable that has the same structure as - the input and with all the element values as zero. The variable can be a Tensor - or TensorArray. - - .. code-block:: text - - - Given - X = [[0, 1, 2, 0], - [0, 3, 4, 0], - [0, 0, 0, 0]], - output is: - Out = [[0, 0, 0, 0], - [0, 0, 0, 0], - [0, 0, 0, 0]]. - - Args: - x (Variable): The input variable, which could be a tensor or tensor array - - Returns: - Variable: The zero-filled variable, which has the same type and shape as - the input variable. - - Examples: - - .. 
code-block:: python - y = fluid.layers.fill_zeros_like(x) - """ - helper = LayerHelper('fill_zeros_like', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) - helper.append_op( - type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]}) - return out diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ce5f08de623c8e4572599f8088ecae2e4821cce0..b6614ecf3bc16e73683f4991779769049c6800ed 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -33,6 +33,7 @@ __all__ = [ 'fill_constant', 'argmin', 'argmax', + 'argsort', 'ones', 'zeros', 'reverse', @@ -444,6 +445,58 @@ def argmax(x, axis=0): return out + +def argsort(input, axis=-1, name=None): + """ + Performs sorting on the input Variable along the given axis, and outputs + a sorted data Variable and its corresponding index Variable with the same + shape as :attr:`input`. + + .. code-block:: text + + For example, the given axis is -1 and the input Variable + + input = [[0.15849551, 0.45865775, 0.8563702 ], + [0.12070083, 0.28766365, 0.18776911]], + + after argsort, the sorted Variable becomes + + out = [[0.15849551, 0.45865775, 0.8563702 ], + [0.12070083, 0.18776911, 0.28766365]], + + and the sorted indices along the given axis turn out to be + + indices = [[0, 1, 2], + [0, 2, 1]] + + Args: + input(Variable): The input Variable for sorting. + axis(int): The axis along which to sort the input Variable. When + :attr:`axis` < 0, the actual axis will be :attr:`axis` + + rank(:attr:`input`). Default -1, the last dimension. + name(str|None): (optional) A name for this layer. If set None, the + layer will be named automatically. + + Returns: + tuple: A tuple of sorted data Variable and the sorted indices. + + Examples: + ..
code-block:: python + + input = fluid.layers.data(name='data', shape=[2, 3]) + out, indices = fluid.layers.argsort(input, axis=0) + """ + helper = LayerHelper("argsort", **locals()) + out = helper.create_tmp_variable(dtype=input.dtype, stop_gradient=True) + ids = helper.create_tmp_variable(VarDesc.VarType.INT64, stop_gradient=True) + helper.append_op( + type='argsort', + inputs={'X': input}, + outputs={'Out': out, + 'Indices': ids}, + attrs={'axis': axis}) + return out, ids + + def ones(shape, dtype, force_cpu=False): + """ + **ones** diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index 74f96f456a8dc917b715d0f4308bb5ea41947f0b..71bf5f8b3a9b17f24ce35220a9348bb871852623 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -110,14 +110,23 @@ def infer(use_cuda, save_dirname=None): # The input's dimension should be 2-D and the second dim is 13 # The input data should be >= 0 batch_size = 10 - tensor_x = numpy.random.uniform(0, 10, - [batch_size, 13]).astype("float32") + + test_reader = paddle.batch( + paddle.dataset.uci_housing.test(), batch_size=batch_size) + + test_data = test_reader().next() + test_feat = numpy.array( + [data[0] for data in test_data]).astype("float32") + test_label = numpy.array( + [data[1] for data in test_data]).astype("float32") + assert feed_target_names[0] == 'x' results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_x}, + feed={feed_target_names[0]: numpy.array(test_feat)}, fetch_list=fetch_targets) print("infer shape: ", results[0].shape) print("infer results: ", results[0]) + print("ground truth: ", test_label) def main(use_cuda, is_local=True): diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 8569d838bdd414eb84c6c87674990a25a2fdcdf9..2d70c986b1b6c42ff709e9cf3b4234cf4fc26836 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -127,6 +127,24 @@ class TestPriorBox(unittest.TestCase): assert box.shape[3] == 4 + +class TestAnchorGenerator(unittest.TestCase): + def test_anchor_generator(self): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + anchor, var = fluid.layers.anchor_generator( + input=conv1, + anchor_sizes=[64, 128, 256, 512], + aspect_ratios=[0.5, 1.0, 2.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + assert len(anchor.shape) == 4 + assert anchor.shape == var.shape + assert anchor.shape[3] == 4 + + class TestMultiBoxHead(unittest.TestCase): def test_multi_box_head(self): data_shape = [3, 224, 224] diff --git a/python/paddle/fluid/tests/unittests/.gitignore b/python/paddle/fluid/tests/unittests/.gitignore index 3538a9c2009bb133609153427981fb66974377fa..b1e8fda03aa42f5f7528eafb46c16d55b868bae5 100644 --- a/python/paddle/fluid/tests/unittests/.gitignore +++ b/python/paddle/fluid/tests/unittests/.gitignore @@ -4,3 +4,5 @@ mnist_1.recordio mnist_2.recordio flowers.recordio wmt16.recordio +data_balance_test.recordio +data_balance_with_lod_test.recordio diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 5f27864c140573086d07415f83caca708889a068..f6c8dcabcbc592024188f4742e6c532a704d2289 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -52,3
+52,4 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180) +set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 21f2037ad408b0a92718c0ea2bae5e8bf563c665..cddf00765f4894126988c794763c34629449e8e6 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -18,6 +18,8 @@ import unittest import paddle.fluid as fluid import time import numpy as np +import math +import sys __all__ = ['TestParallelExecutorBase'] @@ -93,6 +95,12 @@ class TestParallelExecutorBase(unittest.TestCase): print "%.4f Instance per second" % ( (batch_size * iter + 2) / (end - begin)) + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") + print first_loss, last_loss # self.assertGreater(first_loss[0], last_loss[0]) return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py new file mode 100644 index 0000000000000000000000000000000000000000..9c7d5d41f0c512a9fb609dce304c1eed929d28b5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py @@ -0,0 +1,110 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
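A note on the parallel_executor_test_base.py hunk above: the new guard averages the first and last loss values and aborts the test process as soon as either is NaN, instead of letting a diverged run waste CI time. A minimal standalone sketch of the same pattern (the helper name is illustrative, not part of the Paddle API):

    import math
    import sys

    import numpy as np

    def abort_on_nan(first_loss, last_loss):
        # mirror the test-base logic: average each loss list, bail out on NaN
        avg_first = np.array(first_loss).mean()
        avg_last = np.array(last_loss).mean()
        if math.isnan(float(avg_first)) or math.isnan(float(avg_last)):
            sys.exit("got NaN loss, training failed.")

    abort_on_nan([0.9, 1.1], [0.2, 0.3])   # passes silently
    # abort_on_nan([float('nan')], [0.1])  # would exit with a failure message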
+ +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +def anchor_generator_in_python(input_feat, anchor_sizes, aspect_ratios, + variances, stride, offset): + num_anchors = len(aspect_ratios) * len(anchor_sizes) + layer_h = input_feat.shape[2] + layer_w = input_feat.shape[3] + out_dim = (layer_h, layer_w, num_anchors, 4) + out_anchors = np.zeros(out_dim).astype('float32') + + for h_idx in range(layer_h): + for w_idx in range(layer_w): + x_ctr = (w_idx * stride[0]) + offset * (stride[0] - 1) + y_ctr = (h_idx * stride[1]) + offset * (stride[1] - 1) + idx = 0 + for r in range(len(aspect_ratios)): + ar = aspect_ratios[r] + for s in range(len(anchor_sizes)): + anchor_size = anchor_sizes[s] + area = stride[0] * stride[1] + area_ratios = area / ar + base_w = np.round(np.sqrt(area_ratios)) + base_h = np.round(base_w * ar) + scale_w = anchor_size / stride[0] + scale_h = anchor_size / stride[1] + w = scale_w * base_w + h = scale_h * base_h + out_anchors[h_idx, w_idx, idx, :] = [ + (x_ctr - 0.5 * (w - 1)), (y_ctr - 0.5 * (h - 1)), + (x_ctr + 0.5 * (w - 1)), (y_ctr + 0.5 * (h - 1)) + ] + idx += 1 + + # set the variance. + out_var = np.tile(variances, (layer_h, layer_w, num_anchors, 1)) + out_anchors = out_anchors.astype('float32') + out_var = out_var.astype('float32') + return out_anchors, out_var + + +class TestAnchorGeneratorOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = {'Input': self.input} + + self.attrs = { + 'anchor_sizes': self.anchor_sizes, + 'aspect_ratios': self.aspect_ratios, + 'stride': self.stride, + 'offset': self.offset, + 'variances': self.variances, + } + + self.outputs = {'Anchors': self.out_anchors, 'Variances': self.out_var} + + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "anchor_generator" + self.set_data() + + def init_test_params(self): + self.batch_size = 1 + self.input_channels = 2 + self.layer_h = 2 + self.layer_w = 2 + + self.anchor_sizes = [64., 128., 256., 512.] + self.aspect_ratios = [0.5, 1., 2.] + self.stride = [16., 16.] + + self.offset = 0.5 + + self.variances = [0.1, 0.1, 0.2, 0.2] + + def init_test_input(self): + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_h, + self.layer_w)).astype('float32') + + def init_test_output(self): + self.out_anchors, self.out_var = anchor_generator_in_python( + self.input, self.anchor_sizes, self.aspect_ratios, self.variances, + self.stride, self.offset) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b29a102a3880406156481fdac54ca7043d3415db --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
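The argsort test below checks the op against numpy's sort/argsort along a given axis. A minimal sketch of that reference behaviour (plain numpy; the values are chosen only for illustration):

import numpy as np

x = np.array([[3., 1.], [2., 4.]], dtype='float32')
out = np.sort(x, axis=0)     # sorted values along axis 0: [[2., 1.], [3., 4.]]
ids = np.argsort(x, axis=0)  # their source positions in x: [[1, 0], [0, 1]]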
+ +import unittest +import numpy as np +from op_test import OpTest + + +class TestArgsortOp(OpTest): + def setUp(self): + self.init_axis() + x = np.random.random((2, 3, 4, 5, 10)).astype("float32") + if self.axis < 0: + self.axis = self.axis + len(x.shape) + self.indices = np.argsort(x, kind='quicksort', axis=self.axis) + self.out = np.sort(x, kind='quicksort', axis=self.axis) + self.op_type = "argsort" + self.inputs = {'X': x} + self.attrs = {'axis': self.axis} + self.outputs = {'Indices': self.indices, 'Out': self.out} + + def init_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output() + + +class TestArgsortOpAxis0(TestArgsortOp): + def init_axis(self): + self.axis = 0 + + +class TestArgsortOpAxis1(TestArgsortOp): + def init_axis(self): + self.axis = 1 + + +class TestArgsortOpAxisNeg2(TestArgsortOp): + def init_axis(self): + self.axis = -2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py new file mode 100644 index 0000000000000000000000000000000000000000..b558d7c2ea172d9c7526c865a4bc54c32f8998b6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_data_balance.py @@ -0,0 +1,187 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
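As background for the LoD (level-of-detail) handling in the test below: a LoD vector stores cumulative row offsets, so rows lod[i]:lod[i+1] of the flattened batch recover sequence i. A minimal sketch of the offset bookkeeping used in prepare_lod_data (plain numpy, with made-up sequence lengths):

import numpy as np

seqs = [np.ones((n, 3)) * n for n in (1, 2, 3)]  # sequences of 1, 2 and 3 rows
lod = [0]
for s in seqs:
    lod.append(lod[-1] + s.shape[0])  # cumulative offsets: [0, 1, 3, 6]
batch = np.concatenate(seqs, axis=0)  # flattened (6, 3) batch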
+ +import unittest +import paddle.fluid as fluid +import paddle.v2 as paddle +import numpy as np + + +class TestDataBalance(unittest.TestCase): + def prepare_data(self): + def fake_data_generator(): + for n in xrange(self.total_ins_num): + yield np.ones((3, 4)) * n, n + + # Prepare data + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch( + fake_data_generator, batch_size=self.batch_size) + feeder = fluid.DataFeeder( + feed_list=[ + fluid.layers.data( + name='image', shape=[3, 4], dtype='float32'), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( + self.data_file_name, reader, feeder) + + def prepare_lod_data(self): + def fake_data_generator(): + for n in xrange(1, self.total_ins_num + 1): + d1 = (np.ones((n, 3)) * n).astype('float32') + d2 = (np.array(n).reshape((1, 1))).astype('int32') + yield d1, d2 + + # Prepare lod data + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.recordio_writer.create_recordio_writer( + filename=self.lod_data_file_name) as writer: + eof = False + generator = fake_data_generator() + while (not eof): + data_batch = [ + np.array([]).reshape((0, 3)), np.array([]).reshape( + (0, 1)) + ] + lod = [0] + for _ in xrange(self.batch_size): + try: + ins = generator.next() + except StopIteration: + eof = True + break + for i, d in enumerate(ins): + data_batch[i] = np.concatenate( + (data_batch[i], d), axis=0) + lod.append(lod[-1] + ins[0].shape[0]) + if data_batch[0].shape[0] > 0: + for i, d in enumerate(data_batch): + t = fluid.LoDTensor() + t.set(data_batch[i], fluid.CPUPlace()) + if i == 0: + t.set_lod([lod]) + writer.append_tensor(t) + writer.complete_append_tensor() + + def setUp(self): + self.use_cuda = fluid.core.is_compiled_with_cuda() + self.data_file_name = './data_balance_test.recordio' + self.lod_data_file_name = './data_balance_with_lod_test.recordio' + self.total_ins_num = 50 + self.batch_size = 10 + self.prepare_data() + self.prepare_lod_data() + + def main(self): + main_prog = fluid.Program() + startup_prog = fluid.Program() + with fluid.program_guard(main_prog, startup_prog): + data_reader = fluid.layers.io.open_files( + filenames=[self.data_file_name], + shapes=[[-1, 3, 4], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + if self.use_cuda: + data_reader = fluid.layers.double_buffer(data_reader) + image, label = fluid.layers.read_file(data_reader) + + place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + parallel_exe = fluid.ParallelExecutor( + use_cuda=self.use_cuda, main_program=main_prog) + + if (parallel_exe.device_count > self.batch_size): + print("WARNING: Unittest TestDataBalance skipped. 
\ + The result is not correct when device count \ + is larger than batch size.") + exit(0) + fetch_list = [image.name, label.name] + + data_appeared = [False] * self.total_ins_num + while (True): + try: + image_val, label_val = parallel_exe.run(fetch_list, + return_numpy=True) + except fluid.core.EnforceNotMet as ex: + self.assertIn("There is no next data.", ex.message) + break + ins_num = image_val.shape[0] + broadcasted_label = np.ones( + (ins_num, 3, 4)) * label_val.reshape((ins_num, 1, 1)) + # every fetched image must equal its broadcast label elementwise + self.assertTrue(np.array_equal(image_val, broadcasted_label)) + for l in label_val: + self.assertFalse(data_appeared[l[0]]) + data_appeared[l[0]] = True + for i in data_appeared: + self.assertTrue(i) + + def main_lod(self): + main_prog = fluid.Program() + startup_prog = fluid.Program() + with fluid.program_guard(main_prog, startup_prog): + data_reader = fluid.layers.io.open_files( + filenames=[self.lod_data_file_name], + shapes=[[-1, 3], [-1, 1]], + lod_levels=[1, 0], + dtypes=['float32', 'int32'], + thread_num=1) + ins, label = fluid.layers.read_file(data_reader) + + place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + parallel_exe = fluid.ParallelExecutor( + use_cuda=self.use_cuda, main_program=main_prog) + + if (parallel_exe.device_count > self.batch_size): + print("WARNING: Unittest TestDataBalance skipped. \ + The result is not correct when device count \ + is larger than batch size.") + exit(0) + fetch_list = [ins.name, label.name] + + data_appeared = [False] * self.total_ins_num + while (True): + try: + ins_tensor, label_tensor = parallel_exe.run( + fetch_list, return_numpy=False) + except fluid.core.EnforceNotMet as ex: + self.assertIn("There is no next data.", ex.message) + break + + ins_val = np.array(ins_tensor) + label_val = np.array(label_tensor) + ins_lod = ins_tensor.lod()[0] + self.assertEqual(ins_val.shape[1], 3) + self.assertEqual(label_val.shape[1], 1) + self.assertEqual(len(ins_lod) - 1, label_val.shape[0]) + for i in range(0, len(ins_lod) - 1): + ins_elem = ins_val[ins_lod[i]:ins_lod[i + 1]][:] + label_elem = label_val[i][0] + # all rows of this sequence must equal its label value + self.assertTrue((ins_elem == label_elem).all()) + self.assertFalse(data_appeared[int(label_elem - 1)]) + data_appeared[int(label_elem - 1)] = True + + for i in data_appeared: + self.assertTrue(i) + + def test_all(self): + self.main() + self.main_lod() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index b4379ad447e01683325dfcbb6a5b322f0b8eac3d..75b4b4e50da04521021dcb1e97cfe495f2619433 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -15,51 +15,248 @@ import unittest import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import delete_ops +import traceback -from transpiler_test import TranspilerTest -class TestDistTranspiler(TranspilerTest): +class TranspilerTest(unittest.TestCase): def setUp(self): - self.current_pserver_ep = "127.0.0.1:6174" + self.trainer_id = 0 + self.trainers = 2 + self.pservers = 2 + # NOTE: we do not actually bind this port + self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" + self.pserver1_ep = "127.0.0.1:6174" + self.pserver2_ep = "127.0.0.1:6175" + self.slice_var_up = True + self.sync_mode = True + self.transpiler = None + + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x,
+ size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) + sgd_optimizer.minimize(avg_cost) + return + + def get_main_program(self): + main = fluid.Program() + with fluid.program_guard(main): + self.net_conf() + self.origin_prog = main.clone() + return main + + def get_trainer(self): + t = self._transpiler_instance() + return t.get_trainer_program() + + def get_pserver(self, ep): + t = self._transpiler_instance() + pserver = t.get_pserver_program(ep) + startup = t.get_startup_program(ep, pserver) + return pserver, startup + + def _transpiler_instance(self): + if not self.transpiler: + main = self.get_main_program() + self.transpiler = fluid.DistributeTranspiler() + self.transpiler.transpile( + self.trainer_id, + program=main, + pservers=self.pserver_eps, + trainers=self.trainers, + slice_var_up=self.slice_var_up, + sync_mode=self.sync_mode) + return self.transpiler + +class TestBasicModel(TranspilerTest): def test_transpiler(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + pserver2, startup2 = self.get_pserver(self.pserver2_ep) + trainer = self.get_trainer() - pserver, startup = self.get_pserver(self.current_pserver_ep) - self.assertEqual([op.type for op in trainer.global_block().ops], - self.get_expect_trainer_ops()) + + self.assertEqual([op.type for op in trainer.global_block().ops], [ + 'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean', + 'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad', + 'elementwise_add_grad', 'send', 'mul_grad', 'split_byref', 'send', + 'send_barrier', 'recv', 'recv', 'fetch_barrier', 'concat' + ]) self.assertEqual(len(pserver.blocks), 3) # block0: listen_and_serv self.assertEqual([op.type for op in pserver.blocks[0].ops], ["listen_and_serv"]) - # block2: optimize pass + # block1~2: optimize pass self.assertEqual([op.type for op in pserver.blocks[1].ops], ["sum", "scale", "sgd"]) - # confirm startup program - - self.assertEqual([op.type for op in startup.global_block().ops], [ - "fill_constant", "fill_constant", "uniform_random", "uniform_random" - ]) - + self.assertEqual([op.type for op in startup.global_block().ops], + ["fill_constant", "fill_constant", "uniform_random"]) # the variable #fc_w will be split into two blocks fc_w_var = startup.global_block().var("fc_w.block1") self.assertEqual(fc_w_var.shape, (500, 1000)) + # all parameters should be optimized on pserver + + pserver_params = [] + for prog in [pserver, pserver2]: + for blk in prog.blocks: + for op in blk.ops: + if "Param" in op.input_names: + param_name = op.input("Param")[0] + is_block_idx = param_name.find(".block") + if is_block_idx != -1: + origin_param_name = param_name[:is_block_idx] + else: + origin_param_name = param_name + pserver_params.append(origin_param_name) + trainer_params = [] + for op in self.origin_prog.global_block().ops: + if "Param" in op.input_names: + trainer_params.append(op.input("Param")[0]) + self.assertEqual(set(pserver_params), set(trainer_params)) + + +class TestNoSliceVar(TranspilerTest): + def setUp(self): + super(TestNoSliceVar, self).setUp() + self.slice_var_up = False + + def test_transpiler(self): + _, startup = self.get_pserver(self.pserver1_ep) + _, startup2 = self.get_pserver(self.pserver2_ep) + + if startup.global_block().vars.has_key("fc_w"): + 
fc_w_var = startup.global_block().vars["fc_w"] + elif startup2.global_block().vars.has_key("fc_w"): + fc_w_var = startup2.global_block().vars["fc_w"] + + self.assertEqual(fc_w_var.shape, (1000, 1000)) - def get_expect_trainer_ops(self): - trainer = fluid.Program() - with fluid.program_guard(trainer): - optimize_ops, params_grads = self.net_conf() +class TestLRDecay(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=1.0, + decay_steps=2100, + decay_rate=0.1, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + return + + def test_transpiler(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer = self.get_trainer() + + self.assertEqual(len(pserver.blocks), 4) + lr_decay_ops = [op.type for op in pserver.blocks[1].ops] + self.assertEqual(lr_decay_ops, [ + "increment", "cast", "fill_constant", "elementwise_div", "floor", + "fill_constant", "elementwise_pow", "fill_constant", + "elementwise_mul" + ]) + + +class TestLRDecayConditional(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.piecewise_decay([10000, 20000], + [1.0, 0.5, 1.0])) + sgd_optimizer.minimize(avg_cost) + return + + def test_transpiler(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer = self.get_trainer() + + serv_op = pserver.blocks[0].ops[0] + sub_blocks = [] + optimize_blocks = [] + for b in serv_op.attrs["optimize_blocks"]: + optimize_blocks.append(b.idx) + for b in pserver.blocks: + if b.idx not in optimize_blocks: + sub_blocks.append(b.idx) + + self.assertEqual(len(pserver.blocks), 7) + lr_decay_ops = [op.type for op in pserver.blocks[1].ops] + self.assertEqual(lr_decay_ops, [ + "increment", "cast", "fill_constant", "fill_constant", "less_than", + "logical_not", "conditional_block", "fill_constant", + "fill_constant", "less_than", "logical_not", "logical_and", + "logical_and", "conditional_block", "fill_constant", + "conditional_block" + ]) + # test the condition blocks + for b in sub_blocks: + if b == 0: + continue + block = pserver.blocks[b] + self.assertEqual([op.type for op in block.ops], ["assign"]) + + +class TestL2Decay(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc( + input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr( + name='fc_w', + regularizer=fluid.regularizer.L2Decay(), + gradient_clip=fluid.clip.GradientClipByValue(0.1)), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) 
+ sgd_optimizer.minimize(avg_cost) + return + + def test_transpiler(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer = self.get_trainer() + + self.assertEqual(len(pserver.blocks), 3) + self.assertEqual([op.type for op in pserver.blocks[1].ops], + ["sum", "scale", "clip", "sgd"]) + self.assertEqual( + [op.type for op in pserver.blocks[2].ops], + ["sum", "scale", "clip", "scale", "elementwise_add", "sgd"]) + # TODO(typhoonzero): test clipping and L2Decay ops are removed from trainer + - delete_ops(trainer.global_block(), optimize_ops) - ops = [op.type for op in trainer.global_block().ops] + [ - "split_byref", "send", "send_barrier", "recv", "recv", - "fetch_barrier", "concat" - ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send") - return ops + # FIXME(typhoonzero): need to add test for async case: + # see https://github.com/PaddlePaddle/Paddle/issues/11691 +class TestAsyncSGD(TranspilerTest): + pass if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py new file mode 100644 index 0000000000000000000000000000000000000000..712fd5849d80b1915ae3b2ae5108bedee8d88a2c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -0,0 +1,203 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
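The transpiler assertions above hinge on how slice_var_up partitions parameters: a (1000, 1000) fc_w is sliced along dim 0 into one block per pserver, which is why the startup program only holds fc_w.block1 with shape (500, 1000). A rough sketch of that partitioning rule (plain numpy; the real transpiler also rounds and aligns block sizes, so this is illustrative only):

import numpy as np

fc_w = np.zeros((1000, 1000), dtype='float32')
num_pservers = 2
rows = fc_w.shape[0] // num_pservers
blocks = [fc_w[i * rows:(i + 1) * rows] for i in range(num_pservers)]
# blocks[1].shape == (500, 1000), matching the fc_w.block1 check above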
+ +import numpy as np +import argparse +import time +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal + +IS_SPARSE = True +EMBED_SIZE = 32 +HIDDEN_SIZE = 256 +N = 5 +BATCH_SIZE = 32 +ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy + + +def get_model(): + def __network__(words): + embed_first = fluid.layers.embedding( + input=words[0], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_second = fluid.layers.embedding( + input=words[1], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_third = fluid.layers.embedding( + input=words[2], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_forth = fluid.layers.embedding( + input=words[3], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + + concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], axis=1) + hidden1 = fluid.layers.fc(input=concat_embed, + size=HIDDEN_SIZE, + act='sigmoid') + predict_word = fluid.layers.fc(input=hidden1, + size=dict_size, + act='softmax') + cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) + avg_cost = fluid.layers.mean(cost) + return avg_cost, predict_word + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + + first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') + second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') + third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') + next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + avg_cost, predict_word = __network__( + [first_word, second_word, third_word, forth_word, next_word]) + + inference_program = paddle.fluid.default_main_program().clone() + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) + + return inference_program, avg_cost, train_reader, test_reader, predict_word + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +def run_pserver(pserver_endpoints, trainers, current_endpoint): + get_model() + t = get_transpiler(0, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + exe.run(pserver_prog) + + +class TestDistMnist(unittest.TestCase): + def setUp(self): + self._trainers = 1 + self._pservers = 1 + self._ps_endpoints = "127.0.0.1:9123" + + def start_pserver(self, endpoint): + p = Process( + target=run_pserver, + args=(self._ps_endpoints, self._trainers, endpoint)) + p.start() + return p.pid + + def _wait_ps_ready(self, pid): + retry_times = 5 + while True: + assert retry_times >= 0, "wait ps ready failed" + 
time.sleep(1) + try: + # the listen_and_serv_op creates a file containing the listen port + # under /tmp once it is ready to process all RPC calls. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def stop_pserver(self, pid): + os.kill(pid, signal.SIGKILL) + + def test_with_place(self): + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + pserver_pid = self.start_pserver(self._ps_endpoints) + self._wait_ps_ready(pserver_pid) + + self.run_trainer(p, 0) + + self.stop_pserver(pserver_pid) + + def run_trainer(self, place, trainer_id): + test_program, avg_cost, train_reader, test_reader, predict = get_model() + t = get_transpiler(trainer_id, + fluid.default_main_program(), self._ps_endpoints, + self._trainers) + + trainer_prog = t.get_trainer_program() + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + use_gpu = True if core.is_compiled_with_cuda() else False + + exec_strategy = ExecutionStrategy() + exec_strategy.use_cuda = use_gpu + train_exe = fluid.ParallelExecutor( + use_cuda=use_gpu, + main_program=trainer_prog, + loss_name=avg_cost.name, + exec_strategy=exec_strategy) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.itervalues() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + for pass_id in xrange(10): + for batch_id, data in enumerate(train_reader()): + avg_loss_np = train_exe.run(feed=feeder.feed(data), + fetch_list=[avg_cost.name]) + loss = np.array(avg_loss_np).mean() + if float(loss) < 5.0: + return + if math.isnan(loss): + raise AssertionError("Got NaN loss, training failed.") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py deleted file mode 100644 index 23871508d8042ade5253c2f0b3bc9f32ec71a135..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- -import unittest -import paddle.fluid.core as core -import numpy -import paddle.fluid.layers as layers -from paddle.fluid.framework import Program, program_guard -from paddle.fluid.executor import Executor - -import paddle.fluid as fluid -import paddle.fluid.core as core - - -class TestFillZerosLikeOpForTensorArray(unittest.TestCase): - def place(self): - return core.CPUPlace() - - def test_zero_filling_lod_tensor_array(self): - tensor = core.LoDTensor() - tensor.set( - numpy.arange(20).reshape(20, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]]) - - expect = [ - numpy.array( - [0, 0, 0, 0, 0], dtype='int32'), numpy.array( - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int32'), - numpy.array( - [0, 0, 0], dtype='int32') - ] - - lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]] - self.main( - tensor=tensor, - expect_array=expect, - expect_lod=lod, - expect_max_len=3) - - def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0): - place = self.place() - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[10]) - x.persistable = True - table = layers.lod_rank_table(x, level=level) - max_len = layers.max_sequence_len(table) - max_len.persistable = True - array = layers.lod_tensor_to_array(x, table) - array = layers.fill_zeros_like(array) - array.persistable = True - - result = layers.array_to_lod_tensor(array, table) - result.persistable = True - exe = Executor(place) - scope = core.Scope() - exe.run(program, feed={'x': tensor}, scope=scope) - var = scope.find_var(array.name) - array = var.get_lod_tensor_array() - if expect_array is not None and expect_lod is not None: - self.check_array_same(array, expect_array, expect_lod) - - self.assertEqual( - numpy.array(scope.find_var(max_len.name).get_tensor())[0], - expect_max_len) - - def check_array_same(self, array, expect_tensor, expect_lod): - self.assertEqual(len(expect_tensor), len(array)) - for i, exp in enumerate(zip(expect_tensor, expect_lod)): - exp_tensor, exp_lod = exp - exp_tensor = numpy.expand_dims(exp_tensor, axis=1) - self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i]))) - self.assertEqual(exp_lod, array[i].lod()) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 9d4b2d4434f3ec9cb62acd8b0e08dfea16279320..842d34c07e94a79e3351347e2528ecc478cc56dc 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -419,6 +419,15 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(iou) print(str(program)) + def test_argsort(self): + program = Program() + with program_guard(program): + data = layers.data(name='x', shape=[2, 3, 3], dtype="float32") + out, ids = layers.argsort(input=data, axis=1) + self.assertIsNotNone(out) + self.assertIsNotNone(ids) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 252793944462244539084a288e5259f216359650..9a2733927d38f1a2b1af92fcc12f036158b4d06f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -16,6 +16,8 @@ import paddle.fluid as fluid import numpy as np import unittest import os +import sys +import math 
def simple_fc_net(): @@ -73,6 +75,14 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): train_loss, = train_exe.run([loss.name], feed=feed_dict) + avg_test_loss_val = np.array(test_loss).mean() + if math.isnan(float(avg_test_loss_val)): + sys.exit("got NaN loss, testing failed.") + + avg_train_loss_val = np.array(train_loss).mean() + if math.isnan(float(avg_train_loss_val)): + sys.exit("got NaN loss, training failed.") + self.assertTrue( np.allclose( train_loss, test_loss, atol=1e-8), diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py deleted file mode 100644 index f4aa7426bc315be501348a64e2f15caed6dc8810..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np - -import paddle.fluid as fluid -from paddle.fluid.transpiler.distribute_transpiler import delete_ops - -from transpiler_test import TranspilerTest - - -class TestSimpleDistTranspiler(TranspilerTest): - def setUp(self): - self.current_pserver_ep = "127.0.0.1:6175" - - def test_simple_transpiler(self): - np.random.seed(1) - - trainer = self.get_trainer() - pserver, startup = self.get_pserver(self.current_pserver_ep) - self.assertEqual([op.type for op in trainer.global_block().ops], - self.get_expect_trainer_ops()) - - self.assertEqual(len(pserver.blocks), 2) - # block0: listen_and_serv - self.assertEqual([op.type for op in pserver.blocks[0].ops], - ["listen_and_serv"]) - # block1: optimize pass - self.assertEqual([op.type for op in pserver.blocks[1].ops], - ["sum", "scale", "sgd"]) - - # confirm startup program - self.assertEqual([op.type for op in startup.global_block().ops], - ["fill_constant", "uniform_random", "uniform_random"]) - - # the variable #fc_w will NOT be splited - fc_w_var = startup.global_block().var("fc_w@GRAD") - self.assertEqual(fc_w_var.shape, (1000, 1000)) - - fc_w_var = startup.global_block().var("fc_w@GRAD.trainer_0") - self.assertEqual(fc_w_var.shape, (1000, 1000)) - - def get_expect_trainer_ops(self): - trainer = fluid.Program() - - with fluid.program_guard(trainer): - optimize_ops, params_grads = self.net_conf() - - delete_ops(trainer.global_block(), optimize_ops) - ops = [op.type for op in trainer.global_block().ops] + [ - "send", "send_barrier", "recv", "recv", "fetch_barrier" - ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send") - return ops - - def _transpiler_instance(self): - main = self.get_main_program() - t = fluid.DistributeTranspiler() - t.transpile( - self.trainer_id, - program=main, - pservers=self.pserver_eps, - trainers=self.trainers, - slice_var_up=False) - return t - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/transpiler_test.py b/python/paddle/fluid/tests/unittests/transpiler_test.py deleted file mode 
100644 index d84c5d9c41c705cf6d14cc0b5a8c692b0d646337..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/transpiler_test.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import numpy as np - -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.layers as layers - - -class TranspilerTest(unittest.TestCase): - @classmethod - def setUpClass(self): - self.trainer_id = 0 - self.trainers = 2 - self.pservers = 2 - self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" - - def net_conf(self): - x = fluid.layers.data(name='x', shape=[1000], dtype='float32') - - y_predict = fluid.layers.fc(input=x, - size=1000, - act=None, - param_attr=fluid.ParamAttr(name='fc_w')) - - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) - - optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) - return optimize_ops, params_grads - - def get_main_program(self): - main = fluid.Program() - - with fluid.program_guard(main): - self.net_conf() - - return main - - def get_trainer(self): - return self._transpiler_instance().get_trainer_program() - - def get_pserver(self, ep): - t = self._transpiler_instance() - pserver = t.get_pserver_program(ep) - startup = t.get_startup_program(ep, pserver) - return pserver, startup - - def _transpiler_instance(self): - main = self.get_main_program() - t = fluid.DistributeTranspiler() - t.transpile( - self.trainer_id, - program=main, - pservers=self.pserver_eps, - trainers=self.trainers) - return t diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 343901cda3f505c3b3d2ed0c30cf7fea71c8b6b1..05fed72ee6471ba42007b5a9f09f89148ac27a30 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -455,6 +455,8 @@ class DistributeTranspiler(object): __append_optimize_op__(op, per_opt_block, grad_to_block_id, merged_var, lr_ops) + # dedup grad to ids list + grad_to_block_id = list(set(grad_to_block_id)) # append global ops if global_ops: opt_state_block = pserver_program.create_block( @@ -960,8 +962,6 @@ class DistributeTranspiler(object): if not block_map.has_key(varname): block_map[varname] = [] block_map[varname].append((long(offset), long(size))) - # Do not remove this important debug message: - print("block map: %s" % block_map) for varname, splited in block_map.iteritems(): orig_var = program.global_block().var(varname) @@ -1401,6 +1401,16 @@ class DistributeTranspiler(object): break return lr_ops + def _is_opt_role_op(self, op): + # NOTE: depend on oprole to find out whether this op is for + # optimize + op_maker = core.op_proto_and_checker_maker + optimize_role = 
core.op_proto_and_checker_maker.OpRole.Optimize + if op_maker.kOpRoleAttrName() in op.attrs and \ + int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role): + return True + return False + def _get_optimize_pass(self): """ Get optimizer operators, paramters and gradients from origin_program @@ -1413,10 +1423,7 @@ class DistributeTranspiler(object): params_grads = [] origin_var_dict = self.origin_program.global_block().vars for op in block.ops: - # NOTE(Yancey1989): we can not use op role to distinguish an optimizer op - # or not, because all ops in optimizer sub-graph would - # sign the optimizer op role - if self._is_optimizer_op(op): + if self._is_opt_role_op(op): opt_ops.append(op) # HACK(wuyi): if we find grad vars from input of optimize # ops, we may get the output of clip op. Use syntax "@GRAD" diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index e6a03759ef431086390e217eabcdff47e610346c..d9787ef42a31b8dfd1836e7a01d5664049cc66b5 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -4182,9 +4182,9 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None): You can see following configs for further usages: - - time steps: lstmemory_group, paddle/gserver/tests/sequence_layer_group.conf, \ + - time steps: lstmemory_group, paddle/legacy/gserver/tests/sequence_layer_group.conf, \ demo/seqToseq/seqToseq_net.py - - sequence steps: paddle/gserver/tests/sequence_nest_layer_group.conf + - sequence steps: paddle/legacy/gserver/tests/sequence_nest_layer_group.conf :param step: A step function which takes the input of recurrent_group as its own input and returns values as recurrent_group's output every time step. diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 14b64742fd09bf6c197c5d1aa2354271293df239..28ee042282a08be32c13d91312fd97b211277522 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -63,7 +63,7 @@ class Inference(object): assert isinstance(val, api.Vector) val.copyFromNumpyArray(parameters.get(name).flatten()) # the setValueUpdated function is called in randomize, zeroMem, - # load function in paddle/parameter/Parameter.cpp. But in the + # load function in paddle/legacy/parameter/Parameter.cpp. But in the # inference mode, the setValueUpdated is never called, it will # cause the parameter will not be dispatched # in MultiGradientMachine for multi-GPU. 
So setValueUpdated is diff --git a/python/setup.py.in b/python/setup.py.in index 8257f1d5e212a84188a4c51bc2d0f4d4c7af91fb..032784f4a2ae8f3368e8ed4690f3482f0deae557 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -95,7 +95,7 @@ if '${WITH_FLUID_ONLY}'== 'OFF': paddle_bin_dir = 'opt/paddle/bin' paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', - '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main', + '${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main', '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['core.so']} diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index a9775e10ef51fae493523149ee3dbbf227a1aaa9..041ba868afad62ec4e45f537fe4a19c9bfb3a301 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -4,7 +4,7 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do - if [[ $file =~ ^(paddle/api/.*|paddle/capi/.*|paddle/contrib/.*|paddle/cuda/.*|paddle/function/.*|paddle/gserver/.*|paddle/math/.*|paddle/optimizer/.*|paddle/parameter/.*|paddle/pserver/.*|paddle/trainer/.*|paddle/utils/.*) ]]; then + if [[ $file =~ ^(paddle/api/.*|paddle/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/trainer/.*|paddle/utils/.*|paddle/testing/TestUtil.*) ]]; then continue; else cpplint --filter=-readability/fn_size $file;
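As a quick sanity check of the arithmetic in anchor_generator_in_python above: with stride [16, 16], offset 0.5, aspect ratio 1.0 and anchor size 64, the anchor for the first feature-map cell works out as follows (plain Python mirroring the reference code):

import math

stride, offset = (16., 16.), 0.5
anchor_size, ar = 64., 1.0
x_ctr = 0 * stride[0] + offset * (stride[0] - 1)       # 7.5
base_w = round(math.sqrt(stride[0] * stride[1] / ar))  # 16.0
w = (anchor_size / stride[0]) * base_w                 # 64.0
# box corners: x_ctr -/+ 0.5 * (w - 1) -> [-24.0, -24.0, 39.0, 39.0]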